# 集客率改良（会場別・新規作成版）

`AJ-*.csv` を **会場ごとに個別集計** し、以下を出力します。

1. 会場別サマリー（平均・中央値など）
2. 会場ごとの市区町村明細
3. 全会場統計量（会場単位の指標を対象）
4. 会場別ヒートマップHTML

> 備考: `市区町村` 列がないAJファイルは `郵便番号` → `utf_ken_all.csv` で補完します。


In [None]:
# Colab利用時のみ必要
!pip -q install pandas numpy openpyxl folium chardet


In [None]:
import glob
import json
import re
from pathlib import Path

import chardet
import folium
import numpy as np
import pandas as pd


In [None]:
# ===== Settings =====
# Glob pattern for the per-venue visitor CSV files; the part between "AJ-" and
# ".csv" is used as the venue name.
VISITOR_GLOB = "AJ-*.csv"
# Postal-code master CSV used to translate postal codes into municipality names.
KEN_ALL_CSV = "utf_ken_all.csv"
# Population workbook and the sheet (index or name) to read from it.
POP_XLSX = "【総計】市区町村別年齢階級別人口(2025.8).xlsx"
POP_SHEET = 0
# Boundary polygons used for the per-venue maps.
GEOJSON_PATH = "N03-20240101.geojson"
# All outputs are written below this directory; it is created on first run.
OUT_DIR = Path("out_venue")
OUT_DIR.mkdir(exist_ok=True)

# Candidate column names in the AJ files (the first one present wins)
AJ_MUNI_COLS = ["市区町村", "居住地_市区町村", "住所_市区町村", "市区町村名"]
AJ_ZIP_COLS = ["郵便番号", "郵便", "zip", "Zip", "ZIP"]

# Candidate population column names (guessed automatically when none match)
POP_MUNI_COLS = ["市区町村", "市区町村名", "自治体名"]
POP_TOTAL_COLS = ["総数", "人口", "総人口", "人口計"]

# Set these to a column name to override the automatic detection.
# A value that is not present among the sheet's columns is ignored.
MANUAL_POP_MUNI_COL = None
MANUAL_POP_TOTAL_COL = None


In [None]:
def detect_encoding(path, nbytes=200_000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        path: File to inspect; opened in binary mode.
        nbytes: Maximum number of bytes handed to chardet.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        reports no encoding.
    """
    with open(path, "rb") as handle:
        sample = handle.read(nbytes)

    guess = chardet.detect(sample)
    encoding = guess.get("encoding")
    if not encoding:
        return "utf-8"
    return encoding


def read_csv_flex(path):
    """Read a CSV, trying several encodings until one of them parses.

    The detected encoding is tried first, followed by the usual Japanese
    and UTF-8 variants. Each distinct encoding is attempted once. If every
    attempt fails, a final read with pandas' default settings is made and
    its exception, if any, propagates to the caller.
    """
    candidates = [detect_encoding(path), "cp932", "shift_jis", "utf-8-sig", "utf-8"]

    # dict.fromkeys drops repeated encodings while keeping first-seen order.
    for encoding in dict.fromkeys(candidates):
        try:
            frame = pd.read_csv(path, encoding=encoding)
        except Exception:
            # Best effort: any failure just moves on to the next candidate.
            continue
        return frame

    return pd.read_csv(path)


def normalize_muni(series):
    """Normalise municipality names so that different sources can be joined.

    Every value is converted to text, all whitespace is removed and the
    small kana "ヶ" is unified to "ケ".

    Missing values stay missing. Stringifying them would produce the literal
    text "nan" / "None", which slips past ``notna()`` / ``dropna()`` filters
    and shows up downstream as a bogus municipality.

    Args:
        series: pandas Series of municipality names (any dtype).

    Returns:
        Series of normalised names with the same index; entries that were
        missing in the input are missing in the output.
    """
    missing = series.isna()
    text = (series.astype(str)
            .str.strip()
            .str.replace(r"\s+", "", regex=True)
            .str.replace("ヶ", "ケ", regex=False))
    # Restore the missing markers that astype(str) turned into text.
    return text.mask(missing)


def normalize_zip(series):
    """Normalise postal codes to 7-character digit strings.

    Handles the representations a CSV column can end up with:

    * text such as "060-0000" or "〒100-0001" (non-digits are dropped),
    * full-width digits (folded to ASCII via NFKC before filtering),
    * integers that lost their leading zero (left-padded to 7 digits),
    * floats such as ``600000.0``, which pandas produces when a numeric
      column contains blanks. The trailing ".0" is removed first; otherwise
      its zero would be kept as an eighth digit and the code would become
      "6000000" instead of "0600000".

    Args:
        series: pandas Series of postal codes (any dtype).

    Returns:
        Series of 7-character strings. Values without any digit come out
        as "0000000".
    """
    text = series.astype(str).str.normalize("NFKC")
    # Remove the float rendering artefact before the non-digit filter runs.
    text = text.str.replace(r"\.0+$", "", regex=True)
    digits = text.str.replace(r"[^0-9]", "", regex=True)
    return digits.str.zfill(7).str[:7]


def to_num(series):
    """Convert a Series to numbers, tolerating thousands separators.

    Values are stringified, commas are removed, and anything that still
    cannot be parsed becomes NaN.
    """
    as_text = series.astype(str)
    without_commas = as_text.str.replace(",", "", regex=False)
    return pd.to_numeric(without_commas, errors="coerce")


def first_existing_col(df, names):
    """Return the first entry of ``names`` that is a column of ``df``.

    The order of ``names`` decides priority. Returns ``None`` when no
    candidate is present.
    """
    available = df.columns
    return next((candidate for candidate in names if candidate in available), None)


def first_keyword_col(df, includes, excludes=None):
    """Return the first column whose label matches a keyword filter.

    A column matches when its stringified label contains every keyword in
    ``includes`` and none of the keywords in ``excludes``. Columns are
    examined in frame order and the original (unstringified) label is
    returned. Returns ``None`` when nothing matches.
    """
    banned = excludes if excludes else []
    for column in df.columns:
        label = str(column)
        if any(word not in label for word in includes):
            continue
        if any(word in label for word in banned):
            continue
        return column
    return None


def extract_venue_name(path):
    """Derive the venue name from a visitor CSV path.

    For a file named ``AJ-<venue>.csv`` the ``<venue>`` part is returned.
    Any other file name falls back to the name without its final suffix.
    """
    file_path = Path(path)
    hit = re.match(r"AJ-(.*)\.csv$", file_path.name)
    if hit is None:
        return file_path.stem
    return hit.group(1)


In [None]:
def build_zip_to_muni_map(ken_all_csv):
    """Build a postal-code -> municipality-name lookup from a header-less CSV.

    The file is read as UTF-8 (BOM tolerated) with every field kept as text.
    Column index 2 is used as the postal code, and the text of column
    indexes 6 and 7 is concatenated to form the municipality name. Both are
    passed through the shared normalisers so the keys and values are
    comparable with the other inputs.

    NOTE(review): columns 6 and 7 are presumably the prefecture and the
    municipality of the Japan Post KEN_ALL layout. If so, the resulting
    names carry a prefecture prefix; confirm that the population sheet and
    the GeoJSON use the same convention.

    Returns:
        dict mapping a 7-character postal code to a municipality name. When
        a postal code occurs on several rows, the last row wins.
    """
    table = pd.read_csv(ken_all_csv, header=None, dtype=str, encoding="utf-8-sig")

    codes = normalize_zip(table.iloc[:, 2])
    joined_names = table.iloc[:, 6].fillna("") + table.iloc[:, 7].fillna("")
    names = normalize_muni(joined_names)

    pairs = pd.DataFrame({"zip": codes, "muni": names})
    usable = (pairs["zip"].str.len() == 7) & (pairs["muni"] != "")
    pairs = pairs[usable]

    return dict(zip(pairs["zip"], pairs["muni"]))


def detect_population_columns(pop):
    """Pick the municipality-name and total-population columns of a sheet.

    Side effect: the column labels of ``pop`` are replaced in place by their
    stripped string form; the returned names refer to those new labels.

    Resolution order for each of the two columns:
      1. the manual override constant, if it names an existing column;
      2. the first exact match from the candidate list;
      3. keyword matches against the column labels;
      4. a structural fallback: the first object-dtype column for the name,
         and the column with the most numeric-looking cells for the total.

    Returns:
        Tuple ``(muni_col, total_col)``. ``total_col`` is ``None`` only when
        the sheet has no column besides the municipality column.
    """
    labels = [str(name).strip() for name in pop.columns]
    pop.columns = labels

    # 1. Manual overrides are honoured only when they actually exist.
    muni_col = None
    if MANUAL_POP_MUNI_COL in labels:
        muni_col = MANUAL_POP_MUNI_COL
    total_col = None
    if MANUAL_POP_TOTAL_COL in labels:
        total_col = MANUAL_POP_TOTAL_COL

    # 2./3. Exact candidates first, then keyword search.
    if muni_col is None:
        muni_col = first_existing_col(pop, POP_MUNI_COLS)
        if not muni_col:
            muni_col = first_keyword_col(pop, ["市区町村"], ["コード"])

    if total_col is None:
        total_col = first_existing_col(pop, POP_TOTAL_COLS)
        keyword_rules = (
            (["総人口"], None),
            (["人口", "総数"], None),
            (["人口計"], None),
            (["人口"], ["率", "コード"]),
        )
        for includes, excludes in keyword_rules:
            if total_col:
                break
            total_col = first_keyword_col(pop, includes, excludes)

    # 4. Structural fallbacks.
    if muni_col is None:
        text_columns = [name for name in labels if pop[name].dtype == "object"]
        muni_col = (text_columns or labels)[0]

    if total_col is None:
        numeric_counts = [
            (name, to_num(pop[name]).notna().sum())
            for name in labels
            if name != muni_col
        ]
        # max() keeps the earliest column on ties, like a stable descending sort.
        total_col = max(numeric_counts, key=lambda pair: pair[1])[0] if numeric_counts else None

    return muni_col, total_col


def load_population_df(path, sheet=0):
    """Load municipality populations from an Excel sheet with an unknown layout.

    The sheet is read with header rows 0 to 3 in turn. For each attempt the
    name and total columns are auto-detected, cleaned, and the first attempt
    that leaves at least one usable row is returned.

    Args:
        path: Excel workbook to read.
        sheet: Sheet index or name passed to ``pandas.read_excel``.

    Returns:
        Tuple ``(df, meta)``. ``df`` has the columns "市区町村" (normalised
        name) and "人口" (population summed per name). ``meta`` records the
        header row and the source columns that were used.

    Raises:
        KeyError: No header row produced a usable table; the message carries
            the first few debug records.

    NOTE(review): populations are summed per normalised name, so distinct
    municipalities that share a name are merged into one row.
    """
    debug = []
    for header in [0, 1, 2, 3]:
        try:
            pop = pd.read_excel(path, sheet_name=sheet, header=header)
        except Exception as e:
            debug.append({"header": header, "error": str(e)})
            continue

        # detect_population_columns also strips the column labels of `pop`
        # in place, so the returned names index into `pop` directly.
        muni_col, total_col = detect_population_columns(pop)
        if not muni_col or not total_col:
            debug.append({"header": header, "columns": list(pop.columns), "reason": "col_not_found"})
            continue

        out = pop[[muni_col, total_col]].copy()
        out.columns = ["市区町村", "人口"]
        # Drop rows without a name BEFORE normalising. A normaliser that
        # stringifies values would turn NaN into the text "nan", which the
        # dropna below cannot catch and which would then be aggregated as a
        # municipality of its own.
        out = out[out["市区町村"].notna()].copy()
        out["市区町村"] = normalize_muni(out["市区町村"])
        out["人口"] = to_num(out["人口"])
        out = out.dropna(subset=["市区町村", "人口"])
        out = out[out["市区町村"] != ""]

        if len(out) > 0:
            return out.groupby("市区町村", as_index=False)["人口"].sum(), {
                "header": header,
                "muni_col": muni_col,
                "total_col": total_col,
            }

        debug.append({"header": header, "reason": "empty_after_clean"})

    raise KeyError(f"人口列の自動判定に失敗しました。設定セルで手動指定してください。debug={debug[:3]}")


In [None]:
# ===== Per-venue aggregation =====
# For every AJ-*.csv: count visitors per municipality, join the population,
# compute attendance rates, then write
#   - 会場別サマリー.csv        one row per venue
#   - 会場別_市区町村明細.xlsx  one sheet per venue
#   - 全会場_統計量.csv         statistics over the per-venue mean rates


def _unique_sheet_name(label, used):
    """Return an Excel-safe worksheet title for ``label`` not yet in ``used``.

    Excel rejects titles containing any of ``[ ] : * ? / \\``, titles longer
    than 31 characters and titles that differ only by letter case. Forbidden
    characters are replaced by "_", the result is truncated, and a numeric
    suffix is appended on collision. ``used`` (a set of case-folded titles)
    is updated with the returned title.
    """
    cleaned = re.sub(r"[\[\]:*?/\\]", "_", label).strip("'") or "venue"
    base = cleaned[:31]
    candidate = base
    counter = 2
    while candidate.casefold() in used:
        suffix = f"_{counter}"
        candidate = base[:31 - len(suffix)] + suffix
        counter += 1
    used.add(candidate.casefold())
    return candidate


pop_df, pop_meta = load_population_df(POP_XLSX, POP_SHEET)
print("人口列判定:", pop_meta)

zip_to_muni = build_zip_to_muni_map(KEN_ALL_CSV)
paths = sorted(glob.glob(VISITOR_GLOB))
if not paths:
    raise FileNotFoundError(f"{VISITOR_GLOB} が見つかりません")

venue_tables = {}
venue_rows = []

for p in paths:
    venue = extract_venue_name(p)
    aj = read_csv_flex(p)

    muni_col = first_existing_col(aj, AJ_MUNI_COLS)
    zip_col = first_existing_col(aj, AJ_ZIP_COLS)

    if muni_col:
        raw_muni = aj[muni_col]
        # Keep blank cells missing. If normalisation stringifies them they
        # would be counted as visitors from a municipality called "nan".
        muni = normalize_muni(raw_muni).where(raw_muni.notna())
    elif zip_col:
        # Postal codes absent from the lookup become NaN and are dropped below.
        muni = normalize_zip(aj[zip_col]).map(zip_to_muni)
    else:
        raise KeyError(f"{Path(p).name}: 市区町村列/郵便番号列がありません。columns={list(aj.columns)[:30]}")

    tmp = pd.DataFrame({"市区町村": muni})
    tmp = tmp[tmp["市区町村"].notna() & (tmp["市区町村"] != "")]
    vc = tmp.groupby("市区町村", as_index=False).size().rename(columns={"size": "来場者数"})

    # Left join: municipalities without a population match keep NaN rates.
    merged = vc.merge(pop_df, on="市区町村", how="left")
    merged["集客率(%)"] = merged["来場者数"] / merged["人口"] * 100
    merged["集客率(1万人あたり)"] = merged["来場者数"] / merged["人口"] * 10000
    merged = merged.sort_values("集客率(1万人あたり)", ascending=False)

    venue_tables[venue] = merged

    # A population of 0 yields +/-inf; exclude those from the statistics.
    rates = merged["集客率(1万人あたり)"].replace([np.inf, -np.inf], np.nan).dropna()
    venue_rows.append({
        "会場": venue,
        "対象CSV": Path(p).name,
        "市区町村数": int(len(merged)),
        "総来場者数": int(merged["来場者数"].sum()),
        "平均集客率(1万人あたり)": float(rates.mean()) if len(rates) else np.nan,
        "中央値集客率(1万人あたり)": float(rates.median()) if len(rates) else np.nan,
        "最大集客率(1万人あたり)": float(rates.max()) if len(rates) else np.nan,
        "最小集客率(1万人あたり)": float(rates.min()) if len(rates) else np.nan,
    })

venue_summary = pd.DataFrame(venue_rows).sort_values("会場")
venue_summary.to_csv(OUT_DIR / "会場別サマリー.csv", index=False, encoding="utf-8-sig")

used_sheet_names = set()
with pd.ExcelWriter(OUT_DIR / "会場別_市区町村明細.xlsx", engine="openpyxl") as w:
    for venue, table in venue_tables.items():
        table.to_excel(w, index=False, sheet_name=_unique_sheet_name(venue, used_sheet_names))

# Statistics across venues, based on each venue's mean rate.
base = venue_summary["平均集客率(1万人あたり)"]
stats = pd.DataFrame([{
    "会場数": int(len(base)),
    "平均": float(base.mean()),
    "中央値": float(base.median()),
    "標準偏差": float(base.std(ddof=1)),
    "最小": float(base.min()),
    "最大": float(base.max()),
    "25%点": float(base.quantile(0.25)),
    "75%点": float(base.quantile(0.75)),
}])
stats.to_csv(OUT_DIR / "全会場_統計量.csv", index=False, encoding="utf-8-sig")

venue_summary.head(), stats


In [None]:
# ===== Per-venue maps =====
# Parse the boundary GeoJSON once; every venue map reuses the same object.
gj = json.loads(Path(GEOJSON_PATH).read_text(encoding="utf-8"))

def geo_muni(feat):
    """Return the municipality name stored on a GeoJSON feature.

    The property keys "N03_004", "市区町村", "name" and "NAME" are checked in
    that order and the first non-empty value is returned as a string.

    Args:
        feat: GeoJSON feature as a dict.

    Returns:
        The name as ``str``, or ``None`` when the feature has no properties
        or none of the keys holds a non-empty value.
    """
    # GeoJSON allows "properties": null, so treat None like an empty mapping.
    props = feat.get("properties") or {}
    for k in ["N03_004", "市区町村", "name", "NAME"]:
        if k in props and props[k]:
            return str(props[k])
    return None

# Normalising a name goes through pandas, and the same raw names recur across
# features and venues, so the normalised form is memoised per raw name.
_geo_name_cache = {}


def _normalized_geo_name(feature):
    """Return the normalised municipality name of a GeoJSON feature (cached)."""
    raw = geo_muni(feature)
    if raw not in _geo_name_cache:
        _geo_name_cache[raw] = normalize_muni(pd.Series([raw])).iloc[0]
    return _geo_name_cache[raw]


# One choropleth HTML per venue, coloured by visitors per 10,000 residents.
for venue, table in venue_tables.items():
    rmap = dict(zip(table["市区町村"], table["集客率(1万人あたり)"]))
    vals = pd.Series(list(rmap.values()), dtype="float64").replace([np.inf, -np.inf], np.nan).dropna()
    vmin, vmax = (vals.min(), vals.max()) if len(vals) else (0, 1)
    span = (vmax - vmin) if vmax > vmin else 1.0

    m = folium.Map(location=[35.68, 139.76], zoom_start=5, tiles="cartodbpositron")

    # Defaults bind this iteration's values, so the function never depends on
    # whatever the loop variables hold at the time folium evaluates it.
    def style_fn(feature, rmap=rmap, vmin=vmin, span=span):
        v = rmap.get(_normalized_geo_name(feature), np.nan)
        # Missing and infinite rates (population 0) are drawn as "no data";
        # an infinite rate would otherwise overflow in int() below.
        if pd.isna(v) or not np.isfinite(v):
            return {"fillColor":"#dddddd", "color":"#999999", "weight":0.4, "fillOpacity":0.2}
        # Position within [vmin, vmax], clamped so both channels stay in 0..255.
        t = min(max((v - vmin) / span, 0.0), 1.0)
        # Blue (low) to red (high) with a fixed green channel.
        color = f"#{int(255*t):02x}40{int(255*(1-t)):02x}"
        return {"fillColor":color, "color":"#666666", "weight":0.4, "fillOpacity":0.7}

    folium.GeoJson(gj, style_function=style_fn).add_to(m)
    # Pass a plain string path for compatibility with save() implementations
    # that only recognise str/bytes as a file name.
    m.save(str(OUT_DIR / f"map_{venue}.html"))

print("出力先:", OUT_DIR.resolve())
