In [None]:
# ライブラリの読み込み
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the district-level income table (the CSV files are expected under Data/).
income_df = pd.read_csv("Data/income_district.csv")

# Census-related CSV files to combine; they are read in this order.
file_names = [
    "h27_age_df.csv",
    "h27_family_df.csv",
    "h27_gender_df.csv",
    "h27_house_df.csv",
    "h27_house_info_df.csv",
    "h27_industry_df.csv",
    "h27_job_df.csv",
    "h27_labor_df.csv",
    "h27_marriage_df.csv",
    "h27_work_status_df.csv",
]

# Read every census table into a list of DataFrames.
dfs = [pd.read_csv(f"Data/{file}") for file in file_names]

from functools import reduce


def _join_on_district(left, right):
    """Join two frames on their shared district_code column (pandas' default inner join)."""
    return left.merge(right, on="district_code")


# Fold the census tables into one wide frame, merging left to right.
census_merged = reduce(_join_on_district, dfs)

# Attach the income table, drop the district_name column when it is present,
# then discard every row that still contains a missing value.
full_df = (
    income_df.merge(census_merged, on="district_code")
    .drop(columns=["district_name"], errors="ignore")
    .dropna()
)

# Target is the income column; the features are every remaining column
# except the district_code merge key.
y = full_df["income"]
X = full_df.drop(["income", "district_code"], axis=1)

# Quick visual check of the pairwise correlations between income and the
# other numeric columns.
# district_code is only the merge key (it is also dropped from the feature
# matrix X), so it is removed before computing correlations; otherwise a
# numeric code would show up in the heatmap as if it were a variable.
corr_matrix = full_df.drop(columns=["district_code"]).corr(numeric_only=True)

plt.figure(figsize=(10, 8))
# Pin the colour scale to the full [-1, 1] correlation range so that colours
# are comparable between runs.
sns.heatmap(corr_matrix, cmap="coolwarm", vmax=1.0, vmin=-1.0)
plt.title("収入との相関ヒートマップ")
plt.show()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit a seeded random forest (otherwise default settings) on the training rows.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"平均二乗誤差（MSE）：{mse:.2f}")
