<a href="https://colab.research.google.com/github/nanafish/ORS/blob/main/%E9%9B%86%E5%AE%A2%E7%8E%87%E6%94%B9%E8%89%AF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# 1セル完全版：UIで条件(複数選択OK) → 実行（集計→CSV/XLSX/点CSV/KMZ）
#
# ✅複数選択ルール
#  - 同じ列内：OR（選択肢のどれかに一致）
#  - 列をまたぐ：AND（全部の列条件を満たす）
#
# ✅年齢：候補値で複数選択
#  - 数値年齢優勢 → 年齢候補=整数（例 20,21,22...）を複数選択
#     分母(人口)は「選択年齢のmin/maxレンジ」で年齢階級列を合算（※近似）
#  - 文字カテゴリ優勢 → 候補値をそのまま複数選択
#     分母は人口表の年齢列名と一致/部分一致する列を合算（できる限り厳密）
#
# 必要ファイル：
#  - AJ-*.csv（来場CSV複数）
#  - utf_ken_all.csv
#  - 【総計】市区町村別年齢階級別人口(2025.8).xlsx
#  - N03-20240101.geojson
# ============================================

# Install the third-party packages that the imports below rely on.
# NOTE: the `!pip` line is an IPython/Colab shell escape, not plain Python,
# so this cell has to be run inside a notebook kernel.
print("✅ install")
!pip -q install pandas shapely openpyxl ijson chardet ipywidgets
print("✅ done")

import os, re, glob, math, zipfile
import pandas as pd
import numpy as np
import chardet
import ijson
import ipywidgets as widgets
from shapely.geometry import shape
from IPython.display import display, clear_output

# =========================
# ===== 基本設定 =====
# =========================
# Input file names / glob pattern (checked for existence further below).
VISITOR_GLOB   = "AJ-*.csv"
KEN_ALL_CSV    = "utf_ken_all.csv"
POP_XLSX       = "【総計】市区町村別年齢階級別人口(2025.8).xlsx"
N03_GEOJSON    = "N03-20240101.geojson"

# (Optional) Set this only when marital status should also be reflected in the
# denominator; while it is None the marital filter affects the numerator only.
MARITAL_POP_XLSX = None  # e.g. "市区町村別_婚姻人口.xlsx"

OUT_PREFIX     = "市区町村"
BASE_SQUARE_KM = 5.0
SQUARE_SIDE_KM = BASE_SQUARE_KM * 0.65   # 65% => 3.25km
MAX_PER_FILE   = 1800
ALPHA_HEX      = "7f"
Z_TARGET_COL   = "rate_per10k"

# Candidate-value extraction (limits to keep the UI responsive)
SAMPLE_ROWS_FOR_VALUES = 30000   # max rows sampled for candidates (summed over all CSVs)
TOP_N_VALUES = 150              # number of most frequent candidate values kept
ENC_FALLBACKS = ["cp932","shift_jis","utf-8-sig","utf-8"]

# =========================
# ===== 便利関数 =====
# =========================
def detect_encoding(path, nbytes=200000):
    """Guess a file's text encoding from its leading bytes.

    Args:
        path: File to inspect; opened in binary mode.
        nbytes: Maximum number of bytes read from the start of the file.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``"utf-8"`` when
        chardet reports no encoding.
    """
    with open(path, "rb") as fh:
        head = fh.read(nbytes)
    guess = chardet.detect(head)
    encoding = guess.get("encoding")
    if not encoding:
        return "utf-8"
    return encoding

def read_csv_flex(path, nrows=None):
    """Read a CSV file, trying several encodings until one works.

    The encoding guessed by ``detect_encoding`` is tried first, followed by
    every entry of ``ENC_FALLBACKS`` (duplicates are tried only once).

    Args:
        path: CSV file to read.
        nrows: Optional row limit forwarded to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Exception: whatever ``pandas.read_csv`` raises when even the lossy
            cp932 last-resort read fails (e.g. the file does not exist).
    """
    tried = set()
    for enc in [detect_encoding(path)] + ENC_FALLBACKS:
        if enc in tried:
            continue
        tried.add(enc)
        try:
            return pd.read_csv(path, encoding=enc, nrows=nrows)
        except Exception:
            # Deliberate best-effort: a wrong guess can surface as a decode
            # error, an unknown-codec error or a parser error, so any failure
            # simply moves on to the next candidate encoding.
            continue
    # Last resort: decode as cp932 and drop undecodable bytes.
    # `encoding_errors` is the read_csv keyword for this (pandas >= 1.3);
    # the previous `errors="ignore"` is not accepted and raised TypeError.
    return pd.read_csv(path, encoding="cp932", encoding_errors="ignore", nrows=nrows)

def find_zip_col(columns):
    """Pick the column that most likely holds postal codes.

    Keywords are tried in priority order; for each keyword the first column
    whose name contains it wins. If no keyword matches, the first column whose
    name contains both "番号" and "郵" is used.

    Args:
        columns: Iterable of column labels (converted with ``str`` for matching).

    Returns:
        The matching column label, or ``None`` when nothing matches.
    """
    names = list(columns)
    keywords = ("郵便番号", "郵便", "〒", "zip", "ZIP", "Zip", "zipcode", "post")

    by_keyword = [col for kw in keywords for col in names if kw in str(col)]
    if by_keyword:
        return by_keyword[0]

    loose = [col for col in names if "番号" in str(col) and "郵" in str(col)]
    return loose[0] if loose else None

def norm_zip(x):
    """Normalise a postal-code value to a 7-digit string.

    Args:
        x: Raw cell value (string such as "123-4567", or a number when the CSV
            column was parsed numerically).

    Returns:
        A 7-character digit string, or ``None`` when the value is missing or
        cannot be interpreted as a postal code.
    """
    if pd.isna(x):
        return None

    # A column parsed as float yields values like 600001.0; drop the ".0"
    # before collecting digits, otherwise it would add a bogus trailing zero.
    from_number = False
    if isinstance(x, float) and x.is_integer():
        x = int(x)
    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
        from_number = True

    text = str(x).strip()
    float_like = re.fullmatch(r"(\d+)\.0+", text)
    if float_like:
        # Same situation, but the float was already turned into text upstream.
        text = float_like.group(1)
        from_number = True

    s = re.sub(r"\D", "", text)
    if from_number and 0 < len(s) < 7:
        # Numeric parsing strips leading zeros; restore them.
        return s.zfill(7)
    if len(s) == 7:
        return s
    if len(s) == 6:
        return "0" + s
    if len(s) > 7:
        return s[:7]
    return None

def muni_key(pref, muni):
    """Build the municipality key: stripped prefecture + stripped municipality.

    Returns ``None`` when either part is ``None``.
    """
    parts = (pref, muni)
    if any(part is None for part in parts):
        return None
    return "".join(str(part).strip() for part in parts)

def safe_int(x):
    """Parse an integer out of a loosely formatted cell value.

    Thousands separators, units and other non-digit characters are ignored.
    A decimal fraction (".0", ".50", ...) is discarded rather than merged
    into the number, so "1234.0" yields 1234 instead of 12340.

    Args:
        x: Raw cell value (string or number).

    Returns:
        The parsed ``int``, or ``None`` when the value is missing or contains
        no integer digits.
    """
    if pd.isna(x):
        return None
    # Remove the fractional part first so its digits are not concatenated.
    without_fraction = re.sub(r"\.\d+", "", str(x))
    digits = re.sub(r"[^\d]", "", without_fraction)
    return int(digits) if digits else None

def esc_xml(s):
    """Return ``str(s)`` with the five XML special characters escaped."""
    entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&apos;",
    }
    # Each character is mapped independently, so "&" inside the produced
    # entities is never escaped a second time.
    return "".join(entities.get(ch, ch) for ch in str(s))

def stats_series(s, name):
    """Summarise a series as a dict of descriptive statistics.

    Values are coerced to numbers; non-numeric and missing entries are dropped.

    Args:
        s: pandas Series (or anything ``pd.to_numeric`` turns into one).
        name: Label stored under the "対象" key.

    Returns:
        A dict with the label and count; when at least one value remains it
        also holds mean, sample standard deviation / variance (ddof=1),
        min, quartiles, max, skewness, kurtosis and standard error.
        Statistics that need more observations than available are NaN.
    """
    values = pd.to_numeric(s, errors="coerce").dropna()
    n = int(values.count())
    summary = {"対象": name, "件数(n)": n}
    if n == 0:
        return summary

    has_spread = n > 1
    std = float(values.std(ddof=1)) if has_spread else np.nan
    variance = float(values.var(ddof=1)) if has_spread else np.nan

    sem = np.nan
    if has_spread and not math.isnan(std):
        sem = float(std / math.sqrt(n))

    summary["平均"] = float(values.mean())
    summary["標準偏差"] = std
    summary["分散"] = variance
    summary["最小"] = float(values.min())
    summary["25%点"] = float(values.quantile(0.25))
    summary["中央値"] = float(values.median())
    summary["75%点"] = float(values.quantile(0.75))
    summary["最大"] = float(values.max())
    summary["歪度(skew)"] = float(values.skew()) if n > 2 else np.nan
    summary["尖度(kurtosis)"] = float(values.kurt()) if n > 3 else np.nan
    summary["標準誤差(SEM)"] = sem
    return summary

def make_square(lon, lat, side_km):
    """Return a closed square ring of (lon, lat) tuples centred on a point.

    Kilometres are converted to degrees by dividing by 111.32; the longitude
    offset is additionally scaled by cos(latitude). A tiny epsilon keeps the
    division finite when the cosine is zero.

    Args:
        lon: Centre longitude in degrees.
        lat: Centre latitude in degrees.
        side_km: Edge length of the square in kilometres.

    Returns:
        Five (lon, lat) tuples: SW, SE, NE, NW and SW again to close the ring.
    """
    km_per_degree = 111.32
    half_km = side_km / 2.0
    dlat = half_km / km_per_degree
    dlon = half_km / (km_per_degree * math.cos(math.radians(lat)) + 1e-12)

    west, east = lon - dlon, lon + dlon
    south, north = lat - dlat, lat + dlat

    ring = [(west, south), (east, south), (east, north), (west, north)]
    ring.append(ring[0])
    return ring

def kml_color_rgba(r, g, b, a255):
    """Format integer colour channels as a KML ``aabbggrr`` hex string."""
    return "".join(f"{channel:02x}" for channel in (a255, b, g, r))

def normalize_pref_filter(x):
    """Normalise a prefecture filter into a sorted list of unique names.

    Args:
        x: ``None``, a single string, a separator-delimited string, or a
            list/tuple/set of names. Strings are split on the ASCII comma as
            well as the Japanese separators "，" and "、".

    Returns:
        A sorted list of unique, stripped, non-empty names, or ``None`` when
        nothing usable was given (meaning "no filter").
    """
    if x is None:
        return None
    if isinstance(x, (list, tuple, set)):
        # Skip None entries instead of turning them into the text "None".
        parts = [str(v).strip() for v in x if v is not None]
    elif isinstance(x, str):
        parts = [p.strip() for p in re.split(r"[,，、]", x)]
    else:
        return None
    unique = sorted({p for p in parts if p})
    return unique or None

def norm_sex(v):
    """Normalise a sex/gender label to "男", "女" or "計".

    English labels are matched case-insensitively, so "Male", "FEMALE" and
    "All" are recognised as well as the all-lower / all-upper spellings.

    Args:
        v: Raw cell value.

    Returns:
        "男", "女" or "計" for recognised labels, the cleaned-up text for
        anything else, or ``None`` when the value is ``None`` / NaN.
    """
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return None
    s = str(v).strip().replace("　"," ").replace("\u3000"," ").strip()
    s = s.replace("男性","男").replace("女性","女")
    folded = s.lower()
    if folded in ("男", "m", "male", "man"):
        return "男"
    if folded in ("女", "f", "female", "woman"):
        return "女"
    if folded in ("計", "合計", "総数", "全体", "全", "all"):
        return "計"
    # Unknown labels are passed through with their original casing.
    return s

def norm_marital(v):
    """Normalise a marital-status label to "既婚" or "未婚" where possible.

    A label containing "婚" together with "既" maps to "既婚" (checked first);
    one containing "婚" together with "未" maps to "未婚". Anything else is
    returned as cleaned-up text; ``None`` / NaN yield ``None``.
    """
    if v is None:
        return None
    if isinstance(v, float) and np.isnan(v):
        return None

    text = str(v).strip().replace("　"," ").replace("\u3000"," ").strip()
    if "婚" in text:
        if "既" in text:
            return "既婚"
        if "未" in text:
            return "未婚"
    return text

def norm_age_int(v):
    """Extract an integer age from a raw value.

    The first run of one to three digits in ``str(v)`` is taken as the age.

    Returns:
        The age as ``int`` when it lies in 0..120, otherwise ``None`` (also
        for ``None`` / NaN input or when no digits are present).
    """
    is_missing = v is None or (isinstance(v, float) and np.isnan(v))
    if is_missing:
        return None

    hit = re.search(r"(\d{1,3})", str(v))
    if hit is None:
        return None

    age = int(hit.group(1))
    return age if 0 <= age <= 120 else None

# =========================
# ===== 入力存在チェック =====
# =========================
# Abort early when any of the required reference files is missing.
for rf in [KEN_ALL_CSV, POP_XLSX, N03_GEOJSON]:
    if not os.path.exists(rf):
        raise RuntimeError(f"必要ファイルが見つかりません: {rf}")

# Visitor CSVs matched by the glob, sorted; a file whose basename equals that
# of KEN_ALL_CSV is excluded from the list.
visitor_files = sorted([p for p in glob.glob(VISITOR_GLOB) if os.path.basename(p) != os.path.basename(KEN_ALL_CSV)])
if not visitor_files:
    raise RuntimeError(f"来場CSVが見つかりません: {VISITOR_GLOB}")

print("✅ 来場CSV:", len(visitor_files), "files")

# =========================
# ===== 全CSV列名を収集 =====
# =========================
# Collect the stripped header names of every visitor CSV, together with the
# number of files each name appears in.
all_cols = set()
col_count = {}
for fp in visitor_files:
    try:
        h = read_csv_flex(fp, nrows=0)
    except Exception as e:
        # Best-effort: an unreadable file must not stop the notebook, but it is
        # reported so that missing columns in the UI can be explained.
        print(f"[SKIP] {os.path.basename(fp)} : {e}")
        continue
    for c in h.columns:
        cs = str(c).strip()
        all_cols.add(cs)
        col_count[cs] = col_count.get(cs, 0) + 1

# Most widely shared columns first, ties broken alphabetically.
cols_sorted = sorted(all_cols, key=lambda c: (-col_count.get(c, 0), c))
print("✅ unique columns:", len(cols_sorted))

# =========================
# ===== 列候補（揺れ対策）=====
# =========================
# Known header spellings per logical field. suggest_cols() treats each alias
# as a substring to look for in the actual CSV column names.
COL_ALIAS = {
    "性別": ["性別","男女","性"],
    "年齢": ["年齢","年代","年令","年"],
    "既婚・未婚": ["既婚・未婚","婚姻","婚姻状況","結婚","未婚既婚"],
    "職業": ["職業","仕事","お仕事"],
    "ご希望の予約特典": ["ご希望の予約特典","予約特典","特典"],
    "イベントへのご来展回数": ["イベントへのご来展回数","来展回数","来場回数","来展"],
    "イベントを知ったきっかけ": ["イベントを知ったきっかけ","知ったきっかけ","認知経路","きっかけ"],
    "イベントを知ったきっかけ（その他の情報）": ["イベントを知ったきっかけ（その他の情報）","きっかけ（その他）","認知経路（その他）"],
    "出展作家を知ったきっかけ": ["出展作家を知ったきっかけ","作家を知ったきっかけ"],
    "出展作家を知ったきっかけ（以前から知っていた）": ["出展作家を知ったきっかけ（以前から知っていた）","以前から知っていた"],
}

def suggest_cols(key):
    """List the known columns that look like the logical field ``key``.

    A column qualifies when any alias from ``COL_ALIAS[key]`` (or ``key``
    itself when no aliases are registered) occurs in its name. Column order
    follows ``cols_sorted``.

    Returns:
        The matching columns, or all of ``cols_sorted`` when nothing matches.
    """
    aliases = [str(a) for a in COL_ALIAS.get(key, [key])]
    matched = [c for c in cols_sorted if any(a in str(c) for a in aliases)]
    return matched or cols_sorted

def sample_series_for_column(col, max_rows_total=SAMPLE_ROWS_FOR_VALUES):
    """Collect a sample of non-missing values of one column across the visitor CSVs.

    At most 5000 rows are read per file, and reading stops once
    ``max_rows_total`` rows have been read from files that contain the column.
    Column names are compared after stripping surrounding whitespace, which
    matches how the selectable column names were collected.

    Args:
        col: Column name to sample.
        max_rows_total: Overall row budget across all files.

    Returns:
        A Series of the sampled values (empty, dtype object, when the column
        is found in no readable file).
    """
    target = str(col).strip()
    chunks = []
    used = 0
    for fp in visitor_files:
        if used >= max_rows_total:
            break
        try:
            df = read_csv_flex(fp, nrows=min(5000, max_rows_total - used))
        except Exception:
            # Sampling is best-effort; an unreadable file is simply left out.
            continue
        names = [str(c).strip() for c in df.columns]
        if target not in names:
            continue
        used += len(df)
        # Positional access so that a header with stray whitespace (or two
        # headers that become equal after stripping) still yields one Series.
        chunks.append(df.iloc[:, names.index(target)].dropna())
    if not chunks:
        return pd.Series([], dtype="object")
    return pd.concat(chunks, ignore_index=True)

def top_values(col):
    """Return the most frequent non-empty values of ``col`` as strings.

    Values come from ``sample_series_for_column``; at most ``TOP_N_VALUES``
    entries are returned, most frequent first.
    """
    sample = sample_series_for_column(col)
    if sample.empty:
        return []
    cleaned = sample.astype(str).str.strip()
    counts = cleaned[cleaned != ""].value_counts()
    return counts.head(TOP_N_VALUES).index.tolist()

def age_candidates(col):
    """Build the selectable age options for an age column.

    The sampled values are parsed with ``norm_age_int``. When at least 60% of
    them parse, the column is treated as numeric and the options are the most
    frequent integer ages in ascending order; otherwise the most frequent raw
    labels are offered as categories.

    Returns:
        ``(mode, options)`` where ``mode`` is "numeric" or "categorical".
    """
    sample = sample_series_for_column(col)
    if sample.empty:
        return ("categorical", [])

    labels = sample.astype(str).str.strip()
    labels = labels[labels != ""]

    parsed = labels.map(norm_age_int)
    numeric_ratio = float(parsed.notna().mean()) if len(parsed) > 0 else 0.0

    if numeric_ratio < 0.6:
        # Categorical: most frequent labels, in frequency order.
        frequent_labels = labels.value_counts().head(TOP_N_VALUES).index
        return ("categorical", list(frequent_labels))

    # Numeric: most frequent ages, de-duplicated and sorted ascending.
    frequent_ages = parsed.dropna().astype(int).value_counts().head(TOP_N_VALUES).index
    ages = sorted({int(a) for a in frequent_ages if 0 <= int(a) <= 120})
    return ("numeric", ages)

# =========================
# ===== 色設定 =====
# =========================
# Pre-computed KML colour strings. The four fill colours share the alpha
# value parsed from ALPHA_HEX; the line colour uses alpha 255.
a255 = int(ALPHA_HEX, 16)
COL_RED    = kml_color_rgba(220,  60,  60, a255)
COL_YELLOW = kml_color_rgba(255, 220,  80, a255)
COL_BLUE   = kml_color_rgba( 80, 160, 255, a255)
COL_GRAY   = kml_color_rgba(180, 180, 180, a255)
LINE_COLOR = kml_color_rgba(100, 100, 100, 255)

def pick_color_by_quart(q):
    """Map a quartile number to its KML colour (1 red, 2 yellow, 3 blue, else gray)."""
    palette = {1: COL_RED, 2: COL_YELLOW, 3: COL_BLUE}
    return palette.get(int(q), COL_GRAY)

# =========================
# ===== UI =====
# =========================
# Output areas: `log` receives candidate-extraction messages, `out` receives
# the messages printed by run_pipeline().
log = widgets.Output()
out = widgets.Output()

# Free-text prefecture filter; run_pipeline() splits it on "," and an empty
# value means no prefecture restriction.
pref_text = widgets.Text(
    value="",
    description="PREF_FILTER",
    placeholder="例: 東京都 / 東京都,千葉県,埼玉県（空なら全国）",
    layout=widgets.Layout(width="950px")
)

# Column pickers for the three segment fields, pre-filled by suggest_cols().
seg_col_dd = {
    "性別": widgets.Dropdown(options=suggest_cols("性別"), description="性別列", layout=widgets.Layout(width="950px")),
    "年齢": widgets.Dropdown(options=suggest_cols("年齢"), description="年齢列", layout=widgets.Layout(width="950px")),
    "既婚・未婚": widgets.Dropdown(options=suggest_cols("既婚・未婚"), description="既婚列", layout=widgets.Layout(width="950px")),
}

# All value pickers are SelectMultiple (i.e. multiple selection allowed)
sex_ms = widgets.SelectMultiple(options=[], description="性別値(OR)", layout=widgets.Layout(width="950px", height="120px"))
age_ms = widgets.SelectMultiple(options=[], description="年齢値(OR)", layout=widgets.Layout(width="950px", height="160px"))
mar_ms = widgets.SelectMultiple(options=[], description="婚姻値(OR)", layout=widgets.Layout(width="950px", height="120px"))

# Shows the detected age mode; rewritten by refresh_segment_candidates().
age_mode_label = widgets.HTML("<b>年齢モード</b>: 未抽出")

# Additional filter columns; their per-column value pickers are rendered into
# `other_values_area` and tracked in `other_value_boxes`.
other_cols = widgets.SelectMultiple(
    options=cols_sorted,
    description="分子のみ列(複数)",
    layout=widgets.Layout(width="950px", height="190px")
)
other_values_area = widgets.Output()
other_value_boxes = {}

btn_load_seg = widgets.Button(description="① 性別/年齢/婚姻の候補値を抽出", button_style="info")
btn_run = widgets.Button(description="② 実行（集計→CSV/XLSX/点CSV/KMZ）", button_style="success")

def rebuild_other_value_ui(selected_cols):
    """Create the search box / value list / extract button trio for each column.

    ``other_value_boxes`` is cleared and refilled with one entry per column.
    Each entry holds the widgets plus ``"all_opts"``, the full candidate list
    from the last extraction. The search box always filters that full list,
    so changing or clearing the query brings hidden candidates back.

    Args:
        selected_cols: Column names to build pickers for.

    Returns:
        A ``widgets.VBox`` containing one box per column.
    """
    other_value_boxes.clear()

    def apply_filter(entry):
        # Show the candidates containing the current query (all when empty).
        query = str(entry["search"].value).strip()
        full = entry["all_opts"]
        entry["ms"].options = [x for x in full if query in str(x)] if query else list(full)

    def make_onclick(col, entry):
        def _onclick(b):
            opts = top_values(col)
            entry["all_opts"] = list(opts)
            apply_filter(entry)
            with log:
                print(f"✅ 候補抽出: {col} / {len(opts)}件（頻出上位）")
        return _onclick

    def make_search_observer(entry):
        def _obs(change):
            # Nothing to filter until candidates have been extracted.
            if entry["all_opts"]:
                apply_filter(entry)
        return _obs

    items = []
    for c in selected_cols:
        search = widgets.Text(value="", placeholder="検索（contains）", description=f"{c}検索", layout=widgets.Layout(width="950px"))
        ms = widgets.SelectMultiple(options=[], description=f"{c}値(OR)", layout=widgets.Layout(width="950px", height="140px"))
        btn = widgets.Button(description=f"{c}の候補値抽出", button_style="")
        box = widgets.VBox([search, ms, btn])
        entry = {"search": search, "ms": ms, "btn": btn, "box": box, "all_opts": []}
        other_value_boxes[c] = entry

        btn.on_click(make_onclick(c, entry))
        # Only react to changes of the text value itself.
        search.observe(make_search_observer(entry), names="value")
        items.append(box)
    return widgets.VBox(items)

def on_other_cols_change(change):
    """Re-render the per-column value pickers when the column selection changes.

    Only changes of the ``value`` trait are handled; the output area is
    cleared and then shows either the rebuilt pickers or a hint that no
    column is selected.
    """
    if change["name"] != "value":
        return
    with other_values_area:
        clear_output()
        picked = list(change["new"])
        if picked:
            display(rebuild_other_value_ui(picked))
            print("✅ 必要な列だけ『候補値抽出』→ 値を複数選択（OR）")
        else:
            print("分子のみ列が未選択です（必要なら選択）")

# Re-render the value pickers whenever the selection in `other_cols` changes.
other_cols.observe(on_other_cols_change)

# Holds the current age mode ("numeric" or "categorical"); updated by
# refresh_segment_candidates() and read by run_pipeline().
AGE_MODE = {"mode":"categorical"}

def refresh_segment_candidates(_=None):
    """Reload the candidate values of the sex, age and marital selectors.

    Reads the currently chosen columns from ``seg_col_dd``, fills the three
    multi-select widgets, records the detected age mode in ``AGE_MODE`` and
    the mode label, and writes a short report to ``log``.

    Args:
        _: Ignored; present so the function can be used as a button callback.
    """
    with log:
        print("… 性別/年齢/婚姻の候補値を抽出中（頻出上位）")

    sex_col = seg_col_dd["性別"].value
    age_col = seg_col_dd["年齢"].value
    mar_col = seg_col_dd["既婚・未婚"].value

    # Sex
    sex_ms.options = top_values(sex_col)

    # Age: the mode decides how the selection is interpreted later on.
    mode, age_opts = age_candidates(age_col)
    AGE_MODE["mode"] = mode
    age_ms.options = age_opts
    if mode == "numeric":
        mode_text = "numeric（数値年齢）"
    else:
        mode_text = "categorical（カテゴリ文字列）"
    age_mode_label.value = f"<b>年齢モード</b>: {mode_text}"

    # Marital status
    mar_ms.options = top_values(mar_col)

    with log:
        print(f"✅ 性別列: {sex_col} / 候補 {len(sex_ms.options)}")
        print(f"✅ 年齢列: {age_col} / mode={AGE_MODE['mode']} / 候補 {len(age_ms.options)}")
        print(f"✅ 婚姻列: {mar_col} / 候補 {len(mar_ms.options)}")

# Button ① runs the candidate extraction for the sex / age / marital selectors.
btn_load_seg.on_click(refresh_segment_candidates)

# =========================
# ===== 実行本体 =====
# =========================
def run_pipeline(_=None):
    with out:
        clear_output()

    # -------- UIから条件取得 --------
    s = pref_text.value.strip()
    PREF_FILTER = None
    if s:
        PREF_FILTER = [p.strip() for p in s.split(",") if p.strip()] if "," in s else s
    PREFS = normalize_pref_filter(PREF_FILTER)
    scope_name = "全国" if not PREFS else "＋".join(PREFS)

    SEGMENT_FILTERS = {"性別": None, "年齢": None, "既婚・未婚": None, "_age_mode": AGE_MODE["mode"]}
    SEGMENT_FILTERS["性別"] = list(sex_ms.value) if sex_ms.value else None
    SEGMENT_FILTERS["年齢"] = list(age_ms.value) if age_ms.value else None
    SEGMENT_FILTERS["既婚・未婚"] = list(mar_ms.value) if mar_ms.value else None

    OTHER_FILTERS = {}
    for c, d in other_value_boxes.items():
        chosen = list(d["ms"].value) if d["ms"].value else []
        if chosen:
            OTHER_FILTERS[c] = chosen

    # 出力タグ
    cond_tag=[]
    if SEGMENT_FILTERS["性別"] is not None: cond_tag.append("性別")
    if SEGMENT_FILTERS["年齢"] is not None: cond_tag.append("年齢")
    if SEGMENT_FILTERS["既婚・未婚"] is not None: cond_tag.append("婚姻")
    if OTHER_FILTERS: cond_tag.append("その他")
    cond_tag = "条件" + ("_" + "-".join(cond_tag) if cond_tag else "")

    with out:
        print("✅ 範囲:", scope_name)
        print("✅ 条件（列内OR / 列間AND）")
        print("  性別:", SEGMENT_FILTERS["性別"])
        print("  年齢:", SEGMENT_FILTERS["年齢"], f"(mode={SEGMENT_FILTERS['_age_mode']})")
        print("  婚姻:", SEGMENT_FILTERS["既婚・未婚"])
        print("  分子のみ:", {k: len(v) for k,v in OTHER_FILTERS.items()})
        print("✅ 出力タグ:", cond_tag)

    # -------- 郵便番号→市区町村キー --------
    ken = pd.read_csv(KEN_ALL_CSV, header=None, dtype=str, encoding="utf-8-sig")
    ken_zip  = ken.iloc[:, 2].astype(str).str.zfill(7)
    ken_pref = ken.iloc[:, 6].astype(str).str.strip()
    ken_city = ken.iloc[:, 7].astype(str).str.strip()

    zip_to_key  = {}
    zip_to_pref = {}
    for z, p, c in zip(ken_zip, ken_pref, ken_city):
        if z and p and c:
            zip_to_key[z]  = muni_key(p, c)
            zip_to_pref[z] = p

    def zip_allowed(z):
        if not PREFS:
            return True
        return zip_to_pref.get(z) in PREFS

    # -------- 来場CSV → フィルタ → 市区町村count --------
    sex_col = seg_col_dd["性別"].value
    age_col = seg_col_dd["年齢"].value
    mar_col = seg_col_dd["既婚・未婚"].value

    # 事前にセット化（列内OR）
    sex_set = set([norm_sex(x) for x in (SEGMENT_FILTERS["性別"] or [])])
    mar_set = set([norm_marital(x) for x in (SEGMENT_FILTERS["既婚・未婚"] or [])])

    age_mode = SEGMENT_FILTERS["_age_mode"]
    if SEGMENT_FILTERS["年齢"] is None:
        age_set = None
    else:
        if age_mode == "numeric":
            age_set = set([int(x) for x in SEGMENT_FILTERS["年齢"]])
        else:
            age_set = set([str(x).strip() for x in SEGMENT_FILTERS["年齢"]])

    def apply_row_filters(df):
        mask = pd.Series(True, index=df.index)

        # 性別（列内OR）
        if SEGMENT_FILTERS["性別"] is not None:
            if sex_col not in df.columns:
                raise KeyError(f"来場CSVに性別列が無い: {sex_col}")
            s = df[sex_col].map(norm_sex)
            mask &= s.isin(sex_set)

        # 年齢（列内OR）
        if SEGMENT_FILTERS["年齢"] is not None:
            if age_col not in df.columns:
                raise KeyError(f"来場CSVに年齢列が無い: {age_col}")
            if age_mode == "numeric":
                a = df[age_col].map(norm_age_int)
                mask &= a.notna() & a.astype(int).isin(age_set)
            else:
                s = df[age_col].astype(str).str.strip()
                # 文字カテゴリは「完全一致」優先、必要なら contains にしたければ下を変更
                mask &= s.isin(age_set)

        # 婚姻（列内OR）
        if SEGMENT_FILTERS["既婚・未婚"] is not None:
            if mar_col not in df.columns:
                raise KeyError(f"来場CSVに婚姻列が無い: {mar_col}")
            s = df[mar_col].map(norm_marital)
            mask &= s.isin(mar_set)

        # OTHER（列内OR / 列間AND）
        for k, vals in OTHER_FILTERS.items():
            if k not in df.columns:
                raise KeyError(f"来場CSVに列が無い: {k}")
            s = df[k].astype(str).str.strip()
            vv = [str(x).strip() for x in vals]
            # OR：一致 or contains（運用上は contains が強いので採用）
            mm = pd.Series(False, index=df.index)
            for t in vv:
                mm |= (s == t) | s.str.contains(t, na=False)
            mask &= mm

        return mask

    muni_counts={}
    total_rows=0
    kept_rows=0
    skipped_files=0

    for fp in visitor_files:
        df = read_csv_flex(fp, nrows=None)
        zcol = find_zip_col(df.columns)
        if zcol is None:
            skipped_files += 1
            continue
        total_rows += len(df)

        try:
            m = apply_row_filters(df)
        except KeyError as e:
            skipped_files += 1
            with out:
                print(f"[SKIP] {os.path.basename(fp)} : {e}")
            continue

        df2 = df.loc[m].copy()
        kept_rows += len(df2)

        zips = df2[zcol].map(norm_zip)
        if PREFS:
            zips = zips.map(lambda z: z if (z and zip_allowed(z)) else None)

        keys = zips.map(lambda z: zip_to_key.get(z) if z else None)
        vc = keys.value_counts(dropna=True)
        for k, v in vc.items():
            if k is None or (isinstance(k, float) and pd.isna(k)):
                continue
            muni_counts[k] = muni_counts.get(k, 0) + int(v)

    in_scope = sum(muni_counts.values())
    unknown = max(kept_rows - in_scope, 0)
    with out:
        print(f"✅ 読込CSV:{len(visitor_files)} / SKIP:{skipped_files}")
        print(f"✅ 行数合計:{total_rows} / フィルタ後:{kept_rows} / 範囲内紐づき:{in_scope} / 不明推定:{unknown}")

    # -------- 人口（分母）作成：性別/年齢（＋任意で婚姻） --------
    pop = pd.read_excel(POP_XLSX, sheet_name=0, header=1, dtype=str)
    pref_col = pop.columns[1]
    muni_col = pop.columns[2]
    sex_pop_col = pop.columns[3]

    # 年齢階級列推定
    def detect_age_cols(df):
        cols=[]
        for c in df.columns:
            if str(c).strip() in ["総数","総計","計"]:
                continue
            s=str(c)
            if re.search(r"\d", s) and (("歳" in s) or ("～" in s) or ("-" in s) or ("代" in s) or ("以上" in s) or ("未満" in s)):
                cols.append(c)
        return cols

    AGE_COLS = detect_age_cols(pop)
    TOTAL_COL = None
    for c in pop.columns:
        if str(c).strip() == "総数":
            TOTAL_COL = c
            break
    if TOTAL_COL is None:
        TOTAL_COL = "__TOTAL_FALLBACK__"
        pop[TOTAL_COL] = 0
        for c in AGE_COLS:
            pop[TOTAL_COL] = pd.to_numeric(pop[TOTAL_COL], errors="coerce").fillna(0) + pd.to_numeric(pop[c], errors="coerce").fillna(0)

    def parse_age_band(label):
        s=str(label).replace("歳","").replace(" ", "").replace("　","")
        m=re.search(r"(\d+)[～\-](\d+)", s)
        if m: return int(m.group(1)), int(m.group(2))
        m=re.search(r"(\d+)以上", s)
        if m: return int(m.group(1)), None
        m=re.search(r"(\d+)未満", s)
        if m: return 0, int(m.group(1))-1
        m=re.search(r"(\d+)代", s)
        if m:
            lo=int(m.group(1))
            return lo, lo+9
        return None

    AGE_BANDS={}
    for c in AGE_COLS:
        band=parse_age_band(c)
        if band: AGE_BANDS[c]=band

    def choose_age_cols_by_range(age_min, age_max):
        chosen=[]
        for c,(lo,hi) in AGE_BANDS.items():
            if hi is None:
                if age_max >= lo:
                    chosen.append(c)
            else:
                if not (hi < age_min or lo > age_max):
                    chosen.append(c)
        return chosen if chosen else None

    def choose_age_cols_by_labels(labels):
        # 人口表の列名に一致/部分一致するものを集める
        chosen=[]
        for t in labels:
            tt=str(t).strip()
            for c in AGE_COLS:
                if tt == str(c) or tt in str(c) or str(c) in tt:
                    chosen.append(c)
        chosen = list(dict.fromkeys(chosen))
        return chosen if chosen else None

    pop2 = pop[(pop[muni_col].notna()) & (pop[muni_col] != "-")].copy()
    pop2["pref"] = pop2[pref_col].astype(str).str.strip()
    pop2["muni"] = pop2[muni_col].astype(str).str.strip()
    pop2["sex_norm"] = pop2[sex_pop_col].map(norm_sex)
    pop2["key"] = pop2.apply(lambda r: muni_key(r["pref"], r["muni"]), axis=1)

    if PREFS:
        pop2 = pop2[pop2["pref"].isin(PREFS)].copy()

    # 性別（分母）
    sex_spec = SEGMENT_FILTERS["性別"]
    if sex_spec is None:
        pop2b = pop2[pop2["sex_norm"] == "計"].copy()
        sex_note = "計"
    else:
        sex_targets = set([norm_sex(x) for x in sex_spec])
        pop2b = pop2[pop2["sex_norm"].isin(sex_targets)].copy()
        sex_note = ",".join(sorted(sex_targets))

    # 年齢（分母）
    age_spec = SEGMENT_FILTERS["年齢"]
    age_note = "総数"
    if age_spec is None:
        pop2b[TOTAL_COL] = pd.to_numeric(pop2b[TOTAL_COL], errors="coerce").fillna(0)
        pop2b["pop_num"] = pop2b[TOTAL_COL]
    else:
        if age_mode == "numeric":
            # 数値年齢：選択集合を min/max に圧縮して人口階級を合算（近似）
            amin = int(min(age_set))
            amax = int(max(age_set))
            cols = choose_age_cols_by_range(amin, amax)
            if cols:
                for c in cols:
                    pop2b[c] = pd.to_numeric(pop2b[c], errors="coerce").fillna(0)
                pop2b["pop_num"] = pop2b[cols].sum(axis=1)
                age_note = f"数値年齢選択→min/max近似({amin}-{amax}) 列{len(cols)}"
            else:
                pop2b[TOTAL_COL] = pd.to_numeric(pop2b[TOTAL_COL], errors="coerce").fillna(0)
                pop2b["pop_num"] = pop2b[TOTAL_COL]
                age_note = "数値年齢だが人口列マッチ無し→総数"
        else:
            # 文字カテゴリ：人口表の年齢列名に合わせて合算（できる限り厳密）
            cols = choose_age_cols_by_labels(age_spec)
            if cols:
                for c in cols:
                    pop2b[c] = pd.to_numeric(pop2b[c], errors="coerce").fillna(0)
                pop2b["pop_num"] = pop2b[cols].sum(axis=1)
                age_note = f"カテゴリ一致 列{len(cols)}"
            else:
                pop2b[TOTAL_COL] = pd.to_numeric(pop2b[TOTAL_COL], errors="coerce").fillna(0)
                pop2b["pop_num"] = pop2b[TOTAL_COL]
                age_note = "カテゴリだが人口列マッチ無し→総数"

    # 婚姻（分母：任意、無ければ分子のみ）
    mar_spec = SEGMENT_FILTERS["既婚・未婚"]
    marital_note = "指定なし"
    if mar_spec is not None:
        marital_note = "分子のみ（人口表なし）"
        if MARITAL_POP_XLSX is not None and os.path.exists(MARITAL_POP_XLSX):
            mar_df = pd.read_excel(MARITAL_POP_XLSX, sheet_name=0, dtype=str)

            def pick_first_existing_col(df, candidates):
                for c in candidates:
                    if c in df.columns:
                        return c
                for c in candidates:
                    for cc in df.columns:
                        if str(c) in str(cc):
                            return cc
                return None

            m_pref = pick_first_existing_col(mar_df, ["都道府県","pref","県"])
            m_muni = pick_first_existing_col(mar_df, ["市区町村","市町村","muni","市区"])
            m_sex  = pick_first_existing_col(mar_df, ["性別","男女","sex"])
            m_mar  = pick_first_existing_col(mar_df, ["既婚・未婚","婚姻","婚姻状況","marital"])
            m_pop  = pick_first_existing_col(mar_df, ["人口","総数","pop","人数"])
            if all([m_pref,m_muni,m_sex,m_mar,m_pop]):
                mar_df2 = mar_df.copy()
                mar_df2["pref"] = mar_df2[m_pref].astype(str).str.strip()
                mar_df2["muni"] = mar_df2[m_muni].astype(str).str.strip()
                mar_df2["sex_norm"] = mar_df2[m_sex].map(norm_sex)
                mar_df2["mar_norm"] = mar_df2[m_mar].map(norm_marital)
                mar_df2["key"] = mar_df2.apply(lambda r: muni_key(r["pref"], r["muni"]), axis=1)
                mar_df2["pop_mar"] = mar_df2[m_pop].map(safe_int)

                if PREFS:
                    mar_df2 = mar_df2[mar_df2["pref"].isin(PREFS)].copy()

                sex_targets = {"計"} if sex_spec is None else set([norm_sex(x) for x in sex_spec])
                mar_targets = set([norm_marital(x) for x in mar_spec])
                mar_df2 = mar_df2[mar_df2["sex_norm"].isin(sex_targets) & mar_df2["mar_norm"].isin(mar_targets)].copy()
                mar_by_key = mar_df2.groupby("key", as_index=False)["pop_mar"].sum()

                if age_spec is not None:
                    with out:
                        print("⚠️ 婚姻を分母に反映：年齢×婚姻クロス人口が無いと厳密にはできません。今回は婚姻人口で分母を置換します（年齢は分子側優先の扱いになります）。")

                pop2b = pop2b.drop(columns=["pop_num"], errors="ignore").merge(mar_by_key, on="key", how="left")
                pop2b["pop_num"] = pop2b["pop_mar"]
                pop2b = pop2b.drop(columns=["pop_mar"], errors="ignore")
                marital_note = "分母にも反映（婚姻人口表）"
            else:
                with out:
                    print("⚠️ MARITAL_POP_XLSX の列が想定と合いません。婚姻は分子のみで続行します。")

    base_pop = pop2b[["key","pref","muni","pop_num"]].copy()
    base_pop = base_pop.dropna(subset=["key"])
    base_pop["pop_num"] = pd.to_numeric(base_pop["pop_num"], errors="coerce")
    base_pop = base_pop.dropna(subset=["pop_num"])
    base_pop = base_pop[base_pop["pop_num"] > 0].copy()
    base_pop = base_pop.groupby(["key","pref","muni"], as_index=False)["pop_num"].sum()

    with out:
        print(f"✅ 分母条件: 性別={sex_note} / 年齢={age_note} / 婚姻={marital_note}")

    # -------- rate / z / 4分位 --------
    base = base_pop.copy()
    base["count"] = base["key"].map(lambda k: muni_counts.get(k, 0)).astype(int)
    base["rate"] = base["count"] / base["pop_num"]
    base["rate_pct"] = base["rate"] * 100.0
    base["rate_per10k"] = base["rate"] * 10000.0

    pos = base[base["count"] > 0].copy()
    if len(pos) == 0:
        raise RuntimeError("範囲内に『来場>0』の市区町村が1件もありません。条件を緩めてください。")

    z_src = pd.to_numeric(pos[Z_TARGET_COL], errors="coerce")
    mu = float(z_src.mean())
    sd = float(z_src.std(ddof=1))
    pos["z_pos"] = 0.0 if (sd == 0 or np.isnan(sd)) else (z_src - mu) / sd

    zv = pd.to_numeric(pos["z_pos"], errors="coerce")
    q1, q2, q3 = float(zv.quantile(0.25)), float(zv.quantile(0.50)), float(zv.quantile(0.75))
    use_rank = (q1 == q2) or (q2 == q3) or (q1 == q3)
    if use_rank:
        pr = zv.rank(method="average", pct=True)
        pos["quart"] = np.select([pr <= 0.25, pr <= 0.50, pr <= 0.75],[1,2,3],default=4).astype(int)
        with out:
            print("⚠️ zの同値が多いため、順位パーセンタイルで4分位を作成しました。")
    else:
        pos["quart"] = np.select([zv <= q1, zv <= q2, zv <= q3],[1,2,3],default=4).astype(int)

    with out:
        print("✅ 分位境界:", {"Q1(25%)": q1, "Q2(50%)": q2, "Q3(75%)": q3})
        print("✅ 4分位件数:", pos["quart"].value_counts().sort_index().to_dict())

    # -------- 出力（CSV/XLSX） --------
    pos_out = pos[["key","pref","muni","count","pop_num","rate","rate_pct","rate_per10k","z_pos","quart"]].copy()
    jp_map = {
        "key":"自治体キー（都道府県+市区町村）",
        "pref":"都道府県",
        "muni":"市区町村",
        "count":"来場数（郵便番号件数）",
        "pop_num":"人口（分母条件反映）",
        "rate":"集客率（来場数÷人口）",
        "rate_pct":"集客率（%）",
        "rate_per10k":"1万人あたり来場数",
        "z_pos":f"Z値（来場>0基準/対象={Z_TARGET_COL}）",
        "quart":"Zの4分位（1=赤,2=黄,3=青,4=灰）"
    }

    out_csv_jp = f"{OUT_PREFIX}_集客率_集計_日本語_{scope_name}_{cond_tag}.csv"
    pos_out.rename(columns=jp_map).to_csv(out_csv_jp, index=False, encoding="utf-8-sig")

    stats_df = pd.DataFrame([
        stats_series(base["rate"], f"集客率(rate)（{scope_name}：来場0含む／分母条件反映）"),
        stats_series(base["rate_per10k"], f"1万人あたり（{scope_name}：来場0含む／分母条件反映）"),
        stats_series(pos["rate"], f"集客率(rate)（{scope_name}：来場>0のみ／分母条件反映）"),
        stats_series(pos["rate_per10k"], f"1万人あたり（{scope_name}：来場>0のみ／分母条件反映）"),
        stats_series(pos["z_pos"], f"Z値（{scope_name}：来場>0基準/対象={Z_TARGET_COL}）"),
    ])

    out_xlsx = f"{OUT_PREFIX}_統計量_{scope_name}_{cond_tag}.xlsx"
    with pd.ExcelWriter(out_xlsx, engine="openpyxl") as w:
        pos_out.rename(columns=jp_map).to_excel(w, sheet_name=f"{scope_name}_来場>0", index=False)
        stats_df.to_excel(w, sheet_name="統計量", index=False)

    with out:
        print("✅ 出力:", out_csv_jp)
        print("✅ 出力:", out_xlsx)

    # -------- 代表点（N03） --------
    target_keys = set(pos["key"].dropna().tolist())
    key_to_point = {}
    with out:
        print("… N03から代表点抽出中（必要キーのみ）")

    with open(N03_GEOJSON, "r", encoding="utf-8") as f:
        for feat in ijson.items(f, "features.item"):
            props = feat.get("properties", {})
            pref = props.get("N03_001")
            if PREFS and (pref not in PREFS):
                continue
            n03_004 = props.get("N03_004") or ""
            n03_005 = props.get("N03_005") or ""
            muni = (str(n03_004) + str(n03_005)).strip()
            k = muni_key(pref, muni)
            if (k in target_keys) and (k not in key_to_point):
                try:
                    g = shape(feat["geometry"])
                    pt = g.representative_point()
                    key_to_point[k] = (pt.x, pt.y)
                except Exception:
                    pass

    pos2 = pos[pos["key"].isin(key_to_point.keys())].copy()
    with out:
        print(f"✅ 代表点取得: {len(pos2)} / 来場>0: {len(target_keys)}（取れない自治体はスキップ）")
    if len(pos2) == 0:
        raise RuntimeError("代表点が1件も取れませんでした。N03と人口表の市区町村名の揺れを確認してください。")

    # 点CSV
    pts=[]
    for _, r in pos2.iterrows():
        lon, lat = key_to_point[r["key"]]
        nm = f"{r['pref']}{r['muni']}"
        desc = f"来場={int(r['count'])}, 人口={int(r['pop_num'])}, 集客率={r['rate_pct']:.4f}%, 1万人あたり={r['rate_per10k']:.3f}, z={r['z_pos']:.2f}, 4分位={int(r['quart'])}"
        pts.append([nm, desc, lat, lon, r["pref"], r["muni"], int(r["count"]), int(r["pop_num"]), r["rate_pct"], r["rate_per10k"], r["z_pos"], int(r["quart"])])
    df_pts = pd.DataFrame(pts, columns=["名称","説明","緯度","経度","都道府県","市区町村","来場数","人口（分母条件反映）","集客率(%)","1万人あたり","Z値","4分位"])
    out_pts = f"{OUT_PREFIX}_ラベル点_日本語_{scope_name}_{cond_tag}.csv"
    df_pts.to_csv(out_pts, index=False, encoding="utf-8-sig")
    with out:
        print("✅ 出力:", out_pts)

    # -------- ■KMZ --------
    pos2 = pos2.sort_values(["pref","muni"]).reset_index(drop=True)

    # 分割
    parts=[]
    cur=[]
    for _, r in pos2.iterrows():
        cur.append(r)
        if len(cur) >= MAX_PER_FILE:
            parts.append(pd.DataFrame(cur))
            cur=[]
    if cur:
        parts.append(pd.DataFrame(cur))

    with out:
        print(f"✅ KMZ分割数: {len(parts)}（MAX_PER_FILE={MAX_PER_FILE}）")

    def build_kml_text(df_part, part_no):
        """Render one KML document (as a string) for a slice of municipalities.

        Parameters
        ----------
        df_part : DataFrame whose rows provide the columns read below:
            "key", "quart", "pref", "muni", "count", "pop_num",
            "rate_pct", "rate_per10k" and "z_pos".
        part_no : integer part index, zero-padded to two digits in the
            document name.

        Returns
        -------
        str
            The full KML text: XML prolog, one <Document> with a name and an
            HTML description, then one <Placemark> polygon per row.

        Rows whose "key" has no entry in ``key_to_point`` are skipped.
        The polygon ring comes from ``make_square`` and the fill colour from
        ``pick_color_by_quart``; both are defined elsewhere in the file.
        """
        def cdata_safe(text):
            # A literal "]]>" inside the payload would terminate the CDATA
            # section early and corrupt the KML. Splitting it across two
            # adjacent sections keeps the parsed text identical.
            return str(text).replace("]]>", "]]]]><![CDATA[>")

        # Document-level description (HTML, carried inside CDATA).
        doc_desc = (
            f'<b>範囲</b>: {scope_name}<br>'
            f'<b>条件</b>: 性別={SEGMENT_FILTERS["性別"]}, 年齢={SEGMENT_FILTERS["年齢"]}(mode={age_mode}), 婚姻={SEGMENT_FILTERS["既婚・未婚"]}<br>'
            '<b>色</b>: Z値を来場>0で標準化→4分位（Q1赤/Q2黄/Q3青/Q4灰）<br>'
            f'四角サイズ：一辺 {SQUARE_SIDE_KM:.2f} km<br>'
            f'分位境界（参考）：Q1={q1:.4f}, Q2={q2:.4f}, Q3={q3:.4f}'
        )
        header = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<kml xmlns="http://www.opengis.net/kml/2.2">',
            '  <Document>',
            f'    <name>{esc_xml(OUT_PREFIX)}_z四分位色分け_四角{SQUARE_SIDE_KM:.2f}km_{esc_xml(scope_name)}_{esc_xml(cond_tag)}_part{part_no:02d}</name>',
            f'    <description><![CDATA[{cdata_safe(doc_desc)}]]></description>',
        ]

        body = []
        for _, r in df_part.iterrows():
            k = r["key"]
            if k not in key_to_point:
                # No representative point was found for this municipality.
                continue
            lon, lat = key_to_point[k]
            q = int(r["quart"])
            col = pick_color_by_quart(q)
            nm = f"{r['pref']}{r['muni']}"
            desc = (
                f"{nm}<br>"
                f"来場:{int(r['count'])} / 人口:{int(r['pop_num'])}<br>"
                f"集客率:{r['rate_pct']:.4f}% / 1万人あたり:{r['rate_per10k']:.3f}<br>"
                f"<b>Z</b>:{float(r['z_pos']):.2f} / <b>4分位</b>:{q}"
            )
            # KML coordinate tuples are "lon,lat,alt" separated by spaces.
            sq = make_square(lon, lat, SQUARE_SIDE_KM)
            coords = " ".join(f"{x:.6f},{y:.6f},0" for x, y in sq)

            body.extend([
                "    <Placemark>",
                f"      <name>{esc_xml(nm)}</name>",
                f"      <description><![CDATA[{cdata_safe(desc)}]]></description>",
                "      <Style>",
                "        <LineStyle>",
                f"          <color>{LINE_COLOR}</color><width>1</width>",
                "        </LineStyle>",
                "        <PolyStyle>",
                f"          <color>{col}</color><fill>1</fill><outline>1</outline>",
                "        </PolyStyle>",
                "      </Style>",
                "      <Polygon><outerBoundaryIs><LinearRing>",
                f"        <coordinates>{coords}</coordinates>",
                "      </LinearRing></outerBoundaryIs></Polygon>",
                "    </Placemark>",
            ])

        footer = ["  </Document>", "</kml>"]
        return "\n".join(header + body + footer)

    out_kmz_list=[]
    for i, df_part in enumerate(parts, start=1):
        kml_text = build_kml_text(df_part, i)
        kmz_name = f"{OUT_PREFIX}_z四分位色分け_四角{SQUARE_SIDE_KM:.2f}km_{scope_name}_{cond_tag}_part{i:02d}.kmz"
        with zipfile.ZipFile(kmz_name, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("doc.kml", kml_text.encode("utf-8"))
        out_kmz_list.append(kmz_name)
        with out:
            print("✅ 出力:", kmz_name, f"（件数={len(df_part)}）")

    # ---- Colabダウンロード ----
    try:
        from google.colab import files
        for f in [out_csv_jp, out_xlsx, out_pts] + out_kmz_list:
            files.download(f)
    except Exception:
        pass

    with out:
        print("\n✅ 完了")

# Lay out the control panel top to bottom: heading and prefecture filter,
# the three segment selectors that also narrow the denominator, the
# numerator-only filters, the run button, then the log and result panes.
ui = widgets.VBox(children=(
    widgets.HTML("<h3>条件を複数選択（列内OR / 列間AND）→ 実行</h3>"),
    pref_text,
    # --- segment filters applied to numerator and denominator ---
    widgets.HTML("<hr><b>分母も絞る（優先3属性：性別/年齢/婚姻）</b>"),
    seg_col_dd["性別"],
    sex_ms,
    seg_col_dd["年齢"],
    age_mode_label,
    age_ms,
    seg_col_dd["既婚・未婚"],
    mar_ms,
    btn_load_seg,
    # --- filters applied to the numerator only ---
    widgets.HTML("<hr><b>分子だけ絞る（任意：複数列OK / 各列も複数値OK）</b>"),
    other_cols,
    other_values_area,
    # --- execution and output panes ---
    widgets.HTML("<hr>"),
    btn_run,
    widgets.HTML("<hr><b>ログ</b>"),
    log,
    widgets.HTML("<hr><b>実行結果</b>"),
    out,
))

# Clicking the run button starts the whole pipeline.
btn_run.on_click(run_pipeline)

display(ui)
print("✅ 手順：①候補値抽出 →（必要なら分子のみ列も候補抽出して選択）→②実行")


✅ install
✅ done
✅ 来場CSV: 1 files
✅ unique columns: 12


VBox(children=(HTML(value='<h3>条件を複数選択（列内OR / 列間AND）→ 実行</h3>'), Text(value='', description='PREF_FILTER', lay…

✅ 手順：①候補値抽出 →（必要なら分子のみ列も候補抽出して選択）→②実行


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>