In [None]:
import json
import math
from pathlib import Path

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pydantic_settings import BaseSettings, SettingsConfigDict

In [None]:
class FontSettings(BaseSettings):
    """Font settings loaded through pydantic-settings.

    The model is configured to read a UTF-8 encoded ``.env`` file (relative
    path, so presumably resolved against the working directory). Both fields
    are declared without defaults, so each must be supplied.
    """

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
    # Passed to FontProperties(fname=...) later in the notebook, so this is
    # expected to be a path to a font file able to render Japanese text.
    font_ja: str
    # Passed to FontProperties(fname=...) as well; used for the remaining chart text.
    font_en: str


# Instantiated once at module level; later cells read the font paths from it.
font_settings = FontSettings()

In [None]:
# Font for Japanese text (applied to the prefecture tick labels of the bar chart).
fp_ja = fm.FontProperties(fname=str(font_settings.font_ja))
# Font for the pie-chart wedge labels and titles.
fp = fm.FontProperties(fname=str(font_settings.font_en))

In [None]:
def load_population_data(path: Path = Path("data/FEI_CITY_251130201152.csv")) -> pd.DataFrame:
    """Load the fiscal-2020 total population per area from a CSV export.

    The first eight lines of the file are skipped and the ninth is used as the
    header row. Only rows whose survey-year column equals "2020年度" are kept.

    Args:
        path: CSV file to read.

    Returns:
        A DataFrame with two columns: ``municipality`` (the area label exactly
        as it appears in the file) and ``population`` (integer). Population
        cells that cannot be parsed as a number become 0.
    """
    orig = pd.read_csv(path, skiprows=8, header=0)
    # Keep only the rows for the 2020 survey year.
    orig = orig[orig["調査年"] == "2020年度"].reset_index(drop=True)
    # Keep the area and total-population columns and give them English names.
    df = orig[["地域", "A1101_総人口【人】"]].copy()
    df.columns = ["municipality", "population"]
    # Strip thousands separators before converting to integers. The column is
    # cast to str first because pandas parses it as numeric when no cell has a
    # comma, and the .str accessor would then raise AttributeError.
    cleaned = df["population"].astype(str).str.replace(",", "", regex=False)
    # Anything that is still not a number (missing values, placeholders) becomes 0.
    df["population"] = pd.to_numeric(cleaned, errors="coerce").fillna(0).astype(int)
    return df


def remove_wards_without_tokyo(df: pd.DataFrame) -> pd.DataFrame:
    """Drop ward rows, except the ones that belong to Tokyo.

    A row is treated as a ward when its ``municipality`` text contains "区"
    and as a Tokyo row when it contains "東京都". Rows that are wards but not
    Tokyo rows are removed; the result gets a fresh 0-based index.
    """
    names = df["municipality"]
    # A row is dropped only when it is a ward AND not in Tokyo.
    ward_outside_tokyo = names.str.contains("区") & ~names.str.contains("東京都")
    return df.loc[~ward_outside_tokyo].reset_index(drop=True)


def split_municipality_column(df: pd.DataFrame) -> pd.DataFrame:
    """Split the ``municipality`` label into prefecture and municipality parts.

    The text before the first space becomes the new ``prefecture`` column and
    the text after it replaces ``municipality`` (an empty string when the label
    has no space). The input frame is not modified.

    Returns:
        A new DataFrame with columns ``prefecture``, ``municipality`` and
        ``population``, in that order.
    """
    # Split once at the first space: element 0 is the prefecture, the rest (if any)
    # is the municipality name, which may itself contain spaces.
    parts = df["municipality"].str.split(" ", n=1)
    result = df.assign(
        prefecture=parts.str[0],
        municipality=parts.str[1:].str.join(" "),
    )
    return result[["prefecture", "municipality", "population"]]

In [None]:
# Load the CSV, drop non-Tokyo wards, then split the area label into prefecture / municipality.
data = load_population_data().pipe(remove_wards_without_tokyo).pipe(split_municipality_column)

In [None]:
# Distinct prefecture names in order of first appearance.
# NOTE: the spelling "prefectores" is kept because later cells and the JSON key use it.
prefectores = data["prefecture"].drop_duplicates().tolist()

In [None]:
def extract_population_by_prefecture(df: pd.DataFrame, prefecture: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``prefecture`` equals ``prefecture``.

    The result is re-indexed from 0; it is empty when nothing matches.
    """
    in_prefecture = df["prefecture"].eq(prefecture)
    return df.loc[in_prefecture].reset_index(drop=True)


def sort_by_population(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` ordered from largest to smallest ``population``.

    The result is re-indexed from 0; the input frame is left untouched.
    """
    return df.sort_values("population", ascending=False, ignore_index=True)


# For every row, compute its share of its prefecture's population and add it as a column.
def add_population_ratio_by_prefecture(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``population_ratio`` column added.

    ``population_ratio`` is the row's ``population`` divided by the summed
    ``population`` of all rows sharing its ``prefecture``. The input frame is
    not modified.
    """
    # transform("sum") broadcasts each prefecture's total back onto its rows.
    prefecture_totals = df.groupby("prefecture")["population"].transform("sum")
    return df.assign(population_ratio=df["population"] / prefecture_totals)

In [None]:
# Scratch check: the power of ten that leaves three leading digits of 1234 (evaluates to 10).
(10 ** (int(math.log10(1234)) - 2))

In [None]:
# Scratch check: truncate 12345 to its three leading significant digits (12300).
# The scale has to be derived from the value being truncated; taking log10 of a
# different number (1234) gave a scale of 10 and a result of 123400.
sample_value = 12345
sample_scale = 10 ** (int(math.log10(sample_value)) - 2)
sample_value // sample_scale * sample_scale

In [None]:
# Show which prefectures concentrate the most / least population in their five largest municipalities.
def plot_top5_population_ratio_by_prefecture(df: pd.DataFrame) -> None:
    """Plot, per prefecture, the population share held by its five largest municipalities.

    For each distinct ``prefecture`` in ``df`` the population ratios of the five
    most populous rows are summed. The prefectures are drawn as a bar chart in
    descending order of that sum, saved to
    ``figs/top5_population_ratio_by_prefecture.png`` and shown.

    Args:
        df: Frame with ``prefecture``, ``municipality`` and ``population`` columns.
    """
    top5_ratios = {}
    for prefecture in df["prefecture"].unique().tolist():
        prefecture_df = extract_population_by_prefecture(df, prefecture)
        prefecture_df = add_population_ratio_by_prefecture(prefecture_df)
        sorted_df = sort_by_population(prefecture_df)
        top5_ratios[prefecture] = sorted_df.head(5)["population_ratio"].sum()
    sorted_top5 = dict(sorted(top5_ratios.items(), key=lambda item: item[1], reverse=True))

    # Create the output directory up front so savefig does not fail on a fresh checkout.
    output_path = Path("figs/top5_population_ratio_by_prefecture.png")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    plt.figure(figsize=(10, 6))
    # Pass concrete lists rather than dict views to the plotting call.
    plt.bar(list(sorted_top5.keys()), list(sorted_top5.values()))
    # Tick labels are Japanese prefecture names, so they need the Japanese-capable font.
    plt.xticks(rotation=90, fontproperties=fp_ja)
    plt.xlabel("Prefecture")
    plt.ylabel("Top 5 Population Ratio")
    plt.title("Top 5 Population Ratio by Prefecture")
    plt.tight_layout()
    plt.savefig(output_path)
    plt.show()


# Render and save the bar chart for the full data set.
plot_top5_population_ratio_by_prefecture(data)

In [None]:
THOUSAND = 1_000
MILLION = 1_000_000
BILLION = 1_000_000_000
TRILLION = 1_000_000_000_000


def format_kmb(n: int) -> str:
    """Format an integer compactly with a k, M, B or T suffix.

    Values below 1000 are returned unchanged. Larger values are scaled to the
    largest fitting unit and truncated (never rounded up) to three leading
    digits: two decimals below 10 units, one decimal below 100 units, none
    otherwise. Examples: ``1150 -> "1.15k"``, ``12345 -> "12.3k"``,
    ``123456 -> "123k"``.

    Args:
        n: Number to format. It is converted with ``int()``, so integer-like
            types such as NumPy integers are accepted and a fractional part is
            dropped. Negative values are formatted by magnitude with a leading
            minus sign.

    Returns:
        The formatted string.
    """
    x = int(n)
    if x < 0:
        return f"-{format_kmb(-x)}"
    if x < THOUSAND:
        return str(x)

    if x >= TRILLION:
        unit, suffix = TRILLION, "T"
    elif x >= BILLION:
        unit, suffix = BILLION, "B"
    elif x >= MILLION:
        unit, suffix = MILLION, "M"
    else:
        unit, suffix = THOUSAND, "k"

    whole_units = x // unit
    decimals = 2 if whole_units < 10 else (1 if whole_units < 100 else 0)  # noqa: PLR2004

    # Stay in integer arithmetic: truncating a float product is unreliable
    # (1.15 * 100 == 114.99999999999999, which would print 1150 as "1.14k").
    factor = 10**decimals
    scaled = x * factor // unit
    if decimals == 0:
        return f"{scaled}{suffix}"
    integer_part, fraction = divmod(scaled, factor)
    return f"{integer_part}.{fraction:0{decimals}d}{suffix}"

In [None]:
# Eleven wedge colors: one per top-10 municipality plus gray for the aggregated remainder.
PIE_COLORS = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "lightgray"]
FIGS_DIR = Path("figs")


def _save_pie(values: list, wedge_labels: list[str], path: Path, title: str | None = None) -> None:
    """Draw one pie chart on its own figure, save it to ``path`` at 300 dpi and close it."""
    fig, ax = plt.subplots()
    ax.pie(
        values,
        labels=wedge_labels,
        colors=PIE_COLORS,
        startangle=90,
        counterclock=False,
        textprops={"fontproperties": fp},
    )
    if title is not None:
        ax.set_title(title, fontproperties=fp)
    fig.savefig(path, dpi=300)
    # Close explicitly so figures do not accumulate across the loop.
    plt.close(fig)


# Create the output directory up front so savefig does not fail on a fresh checkout.
FIGS_DIR.mkdir(parents=True, exist_ok=True)

for idx, prefecture in enumerate(prefectores):
    data_prefecture = add_population_ratio_by_prefecture(
        sort_by_population(extract_population_by_prefecture(data, prefecture))
    )

    # Ten largest municipalities plus one aggregated "others" entry.
    # NOTE: `labels` is not used by either chart below; it is kept in case other cells read it.
    labels = [*data_prefecture["municipality"][:10], "その他"]
    ratios = [
        *data_prefecture["population_ratio"][:10],
        data_prefecture["population_ratio"][10:].sum(),
    ]
    populations = [*data_prefecture["population"][:10], data_prefecture["population"][10:].sum()]
    populations_text = [format_kmb(p) for p in populations]
    label_texts = [
        f"{population}\n{ratio:.1%}"
        for population, ratio in zip(populations_text, ratios, strict=True)
    ]

    # Chart 1: wedges labelled with the percentage only.
    _save_pie(
        populations,
        [f"{ratio:.1%}" for ratio in ratios],
        FIGS_DIR / f"pie_{idx:02d}_{prefecture}.png",
    )
    # Chart 2: wedges labelled with population and percentage, titled with the total.
    _save_pie(
        populations,
        label_texts,
        FIGS_DIR / f"pie_label_{idx:02d}_{prefecture}.png",
        title=f"Total: {format_kmb(sum(populations))}",
    )

In [None]:
# Collect, per prefecture, the largest municipalities' names, populations and
# ratios into fixed-size arrays. Column `max_municipalities` of `populations`
# and `ratios` holds the aggregate of every remaining municipality.
max_municipalities = 10
populations = np.zeros((len(prefectores), max_municipalities + 1), dtype=np.int32)
ratios = np.zeros((len(prefectores), max_municipalities + 1), dtype=np.float64)
# Names default to "" so a prefecture with fewer rows is padded with strings.
municipalities = np.full((len(prefectores), max_municipalities), "", dtype=object)
for idx, prefecture in enumerate(prefectores):
    data_prefecture = add_population_ratio_by_prefecture(
        sort_by_population(extract_population_by_prefecture(data, prefecture))
    )

    top = data_prefecture.head(max_municipalities)
    rest = data_prefecture.iloc[max_municipalities:]
    # Fill only as many slots as there are rows; a prefecture with fewer than
    # `max_municipalities` rows would otherwise raise a shape-mismatch error.
    count = len(top)
    municipalities[idx, :count] = top["municipality"].to_numpy()
    populations[idx, :count] = top["population"].to_numpy()
    ratios[idx, :count] = top["population_ratio"].to_numpy()
    populations[idx, max_municipalities] = rest["population"].sum()
    ratios[idx, max_municipalities] = rest["population_ratio"].sum()

In [None]:
# Pairwise Euclidean distance between prefectures, using only the top
# `max_municipalities` columns (the aggregated remainder column is excluded).
top_populations = populations[:, :max_municipalities]
population_diff = top_populations[None, :, :] - top_populations[:, None, :]
distance_population = np.linalg.norm(population_diff, axis=-1)

# Same distance on the ratios, with each rank's difference scaled by a fixed
# weight before the norm is taken (the first three ranks weigh more).
ratio_weights = [3, 3, 2, 1, 1, 1, 1, 1, 1, 1]
top_ratios = ratios[:, :max_municipalities]
ratio_diff = top_ratios[None, :, :] - top_ratios[:, None, :]
distance_ratio = np.linalg.norm(ratio_diff * ratio_weights, axis=-1)

In [None]:
# Display the municipality-name array for a visual check before it is written to JSON.
municipalities

In [None]:
# Bundle the per-prefecture arrays and distance matrices and write them as
# pretty-printed UTF-8 JSON (non-ASCII characters are kept as-is).
payload = {
    "prefectores": prefectores,
    "municipalities": municipalities.tolist(),
    "populations": populations.tolist(),
    "ratios": ratios.tolist(),
    "distance_population": distance_population.tolist(),
    "distance_ratio": distance_ratio.tolist(),
}
serialized = json.dumps(payload, indent=2, ensure_ascii=False) + "\n"
Path("data/data.json").write_text(serialized, encoding="utf-8")