# 集客率改良（会場別版）

`AJ-*.csv` を **ファイルごと（=会場ごと）** に集計し、会場別の集客率表・会場別マップ・全会場統計量（平均/中央値など）を作るノートブックです。

- ※ `市区町村` 列が無いCSVでも、`郵便番号` 列があれば `utf_ken_all.csv` を参照して市区町村を補完します（補完される値は「都道府県名＋市区町村名」を連結した形式です）。
- ※ マージ競合が起きた場合は、このセルの競合マーカー（`<<<<<<<` / `=======` / `>>>>>>>`）をすべて削除してから実行してください。

In [None]:
# 必要パッケージ（Colab想定）
!pip -q install pandas numpy openpyxl folium chardet

import glob
import json
import re
from pathlib import Path

import chardet
import folium
import numpy as np
import pandas as pd


In [None]:
# ========= Settings =========
# Glob pattern for the visitor CSVs; each matching file is treated as one venue.
VISITOR_GLOB = "AJ-*.csv"
KEN_ALL_CSV = "utf_ken_all.csv"  # postal code -> municipality lookup, used as a fallback
# Workbook (and sheet) that provides the population per municipality.
POP_XLSX = "【総計】市区町村別年齢階級別人口(2025.8).xlsx"
POP_SHEET = 0
# Boundary polygons drawn on the per-venue maps.
GEOJSON_PATH = "N03-20240101.geojson"

# Column-name candidates for the visitor CSVs; they are tried in order and the
# first one present in a file is used.
MUNI_COL_CANDIDATES = ["市区町村", "居住地_市区町村", "住所_市区町村", "市区町村名"]
ZIP_COL_CANDIDATES = ["郵便番号", "郵便", "zip", "Zip", "ZIP"]

# Column-name candidates for the population sheet (municipality name / total population).
POP_MUNI_COL_CANDIDATES = ["市区町村", "市区町村名"]
POP_TOTAL_COL_CANDIDATES = ["総数", "人口", "総人口"]

# Every output file is written under this directory; it is created if it does not exist.
OUT_DIR = Path("out_venue")
OUT_DIR.mkdir(exist_ok=True)


In [None]:
def detect_encoding(path, nbytes=200_000):
    """Guess the text encoding of the file at ``path``.

    Only the first ``nbytes`` bytes are sampled and handed to chardet.
    Returns the encoding name chardet reports, or ``"utf-8"`` when chardet
    reports none.
    """
    with open(path, "rb") as handle:
        sample = handle.read(nbytes)
    guess = chardet.detect(sample)
    encoding = guess.get("encoding")
    if not encoding:
        return "utf-8"
    return encoding


def read_csv_flex(path):
    """Read a CSV whose encoding is not known in advance.

    The detected encoding is tried first, followed by a fixed list of common
    Japanese and UTF-8 encodings; the first attempt that parses is returned.
    If every attempt fails, a final read with pandas' default settings is
    made and any error from that read propagates to the caller.
    """
    candidates = [detect_encoding(path), "cp932", "shift_jis", "utf-8-sig", "utf-8"]
    seen = set()
    for encoding in candidates:
        if encoding not in seen:
            seen.add(encoding)
            try:
                frame = pd.read_csv(path, encoding=encoding)
            except Exception:
                # Best effort: any failure is treated as "wrong encoding" and
                # the next candidate is tried.
                continue
            return frame
    return pd.read_csv(path)


def first_existing_col(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given; ``None`` is returned when none
    of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)


def normalize_muni_name(s: pd.Series) -> pd.Series:
    """Normalise municipality names so they can be used as join keys.

    Each value is converted to text, stripped, has all whitespace removed
    and has the small "ヶ" replaced by "ケ".

    Missing values (``None`` / ``NaN``) are returned as the empty string.
    Stringifying them would otherwise yield the literal names "nan" / "None",
    which slip through an ``!= ""`` filter and get counted as a municipality.

    Args:
        s: Series of raw municipality names.

    Returns:
        A Series with the same index holding the normalised names.
    """
    # Remember which entries were missing before astype(str) hides that fact.
    missing = s.isna()
    cleaned = (
        s.astype(str)
         .str.strip()
         .str.replace(r"\s+", "", regex=True)
         # Literal replacement; stated explicitly so it never depends on the
         # pandas version's default for ``regex``.
         .str.replace("ヶ", "ケ", regex=False)
    )
    return cleaned.mask(missing, "")


def normalize_zip(s: pd.Series) -> pd.Series:
    """Normalise postal codes to 7-digit strings.

    Handles the shapes a postal-code column takes after CSV parsing:
    text with hyphens or a postal mark, full-width digits, integers that lost
    their leading zeros, and floats (a numeric column that contains blanks is
    parsed as float, so 0600001 arrives as ``600001.0``).

    Values with no digits at all (including missing values) are returned as
    the empty string rather than a fabricated "0000000", so callers can drop
    them with a length or emptiness check.

    Args:
        s: Series of raw postal codes of any dtype.

    Returns:
        A Series with the same index holding 7-character digit strings, or
        ``""`` where no digits were found.
    """
    # NFKC folds full-width digits to ASCII so the digit filter below keeps them.
    text = s.astype(str).str.normalize("NFKC").str.strip()
    # Drop the ".0" that float formatting appends; otherwise that zero would be
    # kept as an extra trailing digit and shift the whole code.
    text = text.str.replace(r"\.0+$", "", regex=True)
    digits = text.str.replace(r"[^0-9]", "", regex=True).fillna("")
    padded = digits.str.zfill(7).str[:7]
    return padded.where(digits != "", "")


def extract_venue_name(file_path: str) -> str:
    """Derive the venue name from a visitor CSV path.

    For a file named ``AJ-<venue>.csv`` the ``<venue>`` part is returned.
    Any other file name falls back to the name without its final suffix.
    """
    path = Path(file_path)
    matched = re.match(r"AJ-(.*)\.csv$", path.name)
    if matched is None:
        return path.stem
    return matched.group(1)


def build_zip_to_muni_map(ken_all_csv: str):
    """Build a postal-code -> municipality lookup from a headerless CSV.

    The file is read as UTF-8 (BOM tolerated) with every column as text.
    Columns are addressed by position: the 3rd holds the postal code, the 7th
    the prefecture name and the 8th the municipality name. The lookup value is
    the prefecture and municipality names concatenated and then normalised.

    Returns:
        A dict mapping 7-character postal codes to names. Rows whose code is
        not 7 characters long or whose name is empty are skipped; when a code
        appears on several rows, the last row wins.
    """
    table = pd.read_csv(ken_all_csv, header=None, dtype=str, encoding="utf-8-sig")
    codes = normalize_zip(table.iloc[:, 2])
    prefecture = table.iloc[:, 6].fillna("")
    city = table.iloc[:, 7].fillna("")
    names = normalize_muni_name(prefecture + city)
    keep = (codes.str.len() == 7) & (names != "")
    return dict(zip(codes[keep], names[keep]))


In [None]:
# 1) Population data
pop = pd.read_excel(POP_XLSX, sheet_name=POP_SHEET)
# Pick whichever candidate column names this sheet actually uses.
pop_muni_col = first_existing_col(pop, POP_MUNI_COL_CANDIDATES)
pop_total_col = first_existing_col(pop, POP_TOTAL_COL_CANDIDATES)
if pop_muni_col is None or pop_total_col is None:
    raise KeyError(f"人口データの列が見つかりません: muni={pop_muni_col}, total={pop_total_col}")

# Reduce the sheet to two columns keyed by the normalised municipality name.
pop_df = pop[[pop_muni_col, pop_total_col]].copy()
pop_df.columns = ["市区町村", "人口"]
pop_df["市区町村"] = normalize_muni_name(pop_df["市区町村"])
# Rows that share a normalised name are summed into a single population figure.
# NOTE(review): this also merges same-named municipalities from different
# prefectures and any per-category rows the sheet may contain — confirm the
# sheet has exactly one row per municipality.
pop_df = pop_df.groupby("市区町村", as_index=False)["人口"].sum()

# 2) Postal-code master (used as a fallback when a CSV has no municipality column)
zip_to_muni = build_zip_to_muni_map(KEN_ALL_CSV)

# 3) Per-venue aggregation
# `records` collects one summary row per venue; `venue_detail_tables` keeps the
# full per-municipality table of each venue, keyed by venue name.
records = []
venue_detail_tables = {}

csv_paths = sorted(glob.glob(VISITOR_GLOB))
if not csv_paths:
    raise FileNotFoundError(f"{VISITOR_GLOB} が見つかりません")

for p in csv_paths:
    venue = extract_venue_name(p)
    df = read_csv_flex(p)

    muni_col = first_existing_col(df, MUNI_COL_CANDIDATES)
    zip_col = first_existing_col(df, ZIP_COL_CANDIDATES)

    # Prefer an explicit municipality column; the postal-code lookup is used
    # only when the CSV has no such column at all.
    # NOTE(review): names from the postal-code lookup are prefecture +
    # municipality, while the column path uses the CSV value as given —
    # confirm both forms match the names used in the population sheet.
    if muni_col:
        tmp = pd.DataFrame({"市区町村": normalize_muni_name(df[muni_col])})
    elif zip_col:
        z = normalize_zip(df[zip_col])
        tmp = pd.DataFrame({"市区町村": z.map(zip_to_muni)})
    else:
        raise KeyError(f"{Path(p).name}: 市区町村列も郵便番号列も見つかりません。columns={list(df.columns)[:30]}")

    # Drop rows without a usable municipality, then count one visitor per remaining row.
    tmp = tmp[tmp["市区町村"].notna() & (tmp["市区町村"] != "")]
    visitors = tmp.groupby("市区町村", as_index=False).size().rename(columns={"size": "来場者数"})

    # Left join: municipalities with no population match are kept, with NaN
    # population and therefore NaN rates.
    merged = visitors.merge(pop_df, on="市区町村", how="left")
    merged["集客率(%)"] = (merged["来場者数"] / merged["人口"]) * 100
    merged["集客率(1万人あたり)"] = (merged["来場者数"] / merged["人口"]) * 10000
    merged = merged.sort_values("集客率(1万人あたり)", ascending=False)

    venue_detail_tables[venue] = merged
    # The summary statistics use only finite rates: infinities are turned into
    # NaN and then dropped together with the unmatched rows.
    valid_rates = merged["集客率(1万人あたり)"].replace([np.inf, -np.inf], np.nan).dropna()

    # Counts cover every row of `merged`; the rate statistics cover only
    # `valid_rates` and are NaN when no valid rate exists.
    records.append({
        "会場": venue,
        "対象CSV": Path(p).name,
        "市区町村数": int(len(merged)),
        "総来場者数": int(merged["来場者数"].sum()),
        "平均集客率(1万人あたり)": float(valid_rates.mean()) if len(valid_rates) else np.nan,
        "中央値集客率(1万人あたり)": float(valid_rates.median()) if len(valid_rates) else np.nan,
        "最大集客率(1万人あたり)": float(valid_rates.max()) if len(valid_rates) else np.nan,
        "最小集客率(1万人あたり)": float(valid_rates.min()) if len(valid_rates) else np.nan,
    })

# Summary table: one row per venue, sorted by venue name, also saved as CSV.
venue_summary = pd.DataFrame(records).sort_values("会場")
venue_summary.to_csv(OUT_DIR / "会場別サマリー.csv", index=False, encoding="utf-8-sig")


def _safe_sheet_name(name, used):
    """Return a worksheet title for ``name`` that Excel accepts and that is unused.

    Characters Excel forbids in sheet titles are replaced by "_", leading and
    trailing apostrophes are removed, and the result is cut to 31 characters.
    If the title is already in ``used`` (compared case-insensitively, as Excel
    does), a numeric suffix is appended. The chosen title is recorded in
    ``used``. A name that is already valid and unique is returned unchanged
    apart from the 31-character cut.
    """
    base = re.sub(r"[\[\]:*?/\\]", "_", name).strip("'")[:31] or "venue"
    candidate = base
    counter = 2
    while candidate.casefold() in used:
        suffix = f"_{counter}"
        # Trim the base so the suffixed title still fits in 31 characters.
        candidate = base[: 31 - len(suffix)] + suffix
        counter += 1
    used.add(candidate.casefold())
    return candidate


# Detail workbook: one sheet per venue. Venue names come from file names, so
# they may contain characters a sheet title cannot hold, and two long names may
# share their first 31 characters; both cases are handled by _safe_sheet_name.
_used_sheet_names = set()
with pd.ExcelWriter(OUT_DIR / "会場別_市区町村明細.xlsx", engine="openpyxl") as writer:
    for venue, table in venue_detail_tables.items():
        table.to_excel(writer, index=False, sheet_name=_safe_sheet_name(venue, _used_sheet_names))

venue_summary


In [None]:
# 4) Cross-venue statistics (computed over each venue's mean rate per 10,000 residents)
base = venue_summary["平均集客率(1万人あたり)"]
# Label -> reduction, listed in the column order of the output table.
_stat_fns = [
    ("平均", base.mean),
    ("中央値", base.median),
    ("標準偏差", lambda: base.std(ddof=1)),  # sample standard deviation
    ("最小", base.min),
    ("最大", base.max),
    ("25%点", lambda: base.quantile(0.25)),
    ("75%点", lambda: base.quantile(0.75)),
]
# The venue count is the number of rows in the summary, whatever their values.
stats = {"会場数": int(len(base))}
for _label, _fn in _stat_fns:
    stats[_label] = float(_fn())
stats_df = pd.DataFrame([stats])
stats_df.to_csv(OUT_DIR / "全会場_統計量.csv", index=False, encoding="utf-8-sig")
stats_df


In [None]:
# 5) Per-venue maps
# The boundary GeoJSON is parsed once; every venue map is drawn from this object.
gj = json.loads(Path(GEOJSON_PATH).read_text(encoding="utf-8"))

# Property keys that may hold the municipality name, in order of preference.
prop_candidates = ["N03_004", "市区町村", "name", "NAME"]

def geojson_muni_name(feat):
    """Return the municipality name stored on a GeoJSON feature.

    The keys in ``prop_candidates`` are checked in order and the first
    non-empty value is returned as a string.

    Args:
        feat: A GeoJSON feature dict.

    Returns:
        The name as ``str``, or ``None`` when the feature has no properties
        (missing or ``null``) or none of the candidate keys has a value.
    """
    # GeoJSON allows "properties": null, so a present-but-None value must be
    # treated the same as a missing one.
    props = feat.get("properties") or {}
    for key in prop_candidates:
        value = props.get(key)
        if value:
            return str(value)
    return None

# Normalise each distinct GeoJSON name once, up front. Doing this inside the
# style callback would build a one-element Series for every feature of every
# venue map.
_geo_raw_names = sorted(
    {name for name in map(geojson_muni_name, gj.get("features", [])) if name is not None}
)
_geo_norm_names = dict(
    zip(_geo_raw_names, normalize_muni_name(pd.Series(_geo_raw_names, dtype=object)))
)

for venue, detail in venue_detail_tables.items():
    # Keep only finite rates. A missing population gives NaN and a zero
    # population gives inf; neither can be placed on the colour scale, and an
    # infinite value would make int() raise OverflowError in the callback.
    rate_map = {
        muni: float(rate)
        for muni, rate in zip(detail["市区町村"], detail["集客率(1万人あたり)"])
        if pd.notna(rate) and np.isfinite(rate)
    }
    m = folium.Map(location=[35.68, 139.76], zoom_start=5, tiles="cartodbpositron")

    # Colour scale bounds for this venue; a flat or empty scale falls back to a
    # range of 1 so the division below is always defined.
    vals = pd.Series(list(rate_map.values()), dtype=float)
    vmin, vmax = (vals.min(), vals.max()) if len(vals) else (0, 1)
    rng = (vmax - vmin) if vmax > vmin else 1.0

    # This venue's values are bound as defaults so the callback never reads a
    # later iteration's data, whenever it happens to be invoked.
    def style_fn(feature, rate_map=rate_map, vmin=vmin, rng=rng):
        raw = geojson_muni_name(feature)
        muni = _geo_norm_names.get(raw)
        if muni is None and raw is not None:
            # Name not seen in the up-front pass; normalise it on demand.
            muni = normalize_muni_name(pd.Series([raw])).iloc[0]
        v = rate_map.get(muni)
        if v is None:
            # No usable rate for this municipality: draw it in neutral grey.
            return {"fillColor": "#dddddd", "color": "#999999", "weight": 0.4, "fillOpacity": 0.2}
        # Position on the scale, clamped so each colour channel stays in 0-255.
        t = min(max((v - vmin) / rng, 0.0), 1.0)
        # Blue (low) to red (high) with a fixed green channel.
        color = f"#{int(255*t):02x}40{int(255*(1-t)):02x}"
        return {"fillColor": color, "color": "#666666", "weight": 0.4, "fillOpacity": 0.7}

    folium.GeoJson(gj, style_function=style_fn, name="集客率").add_to(m)
    folium.LayerControl().add_to(m)
    # A plain string path is passed so saving does not depend on Path support
    # in the installed folium/branca version.
    m.save(str(OUT_DIR / f"map_{venue}.html"))

print(f"出力完了: {OUT_DIR.resolve()}")
