# In [5]:
import requests
import json
import re
from bs4 import BeautifulSoup

# Page that this script downloads and scrapes for the per-region file IDs.
url = "https://www.houjin-bangou.nta.go.jp/download/zenken/"

# Japanese region label -> English name used as the key in the output JSON.
# Entries are grouped by the conventional regional divisions purely for
# readability; insertion order is the same north-to-south listing as before.
pref = {
    # Nationwide aggregate
    "全国": "All",

    # Hokkaido
    "北海道": "Hokkaido",

    # Tohoku
    "青森県": "Aomori",
    "岩手県": "Iwate",
    "宮城県": "Miyagi",
    "秋田県": "Akita",
    "山形県": "Yamagata",
    "福島県": "Fukushima",

    # Kanto
    "茨城県": "Ibaraki",
    "栃木県": "Tochigi",
    "群馬県": "Gunma",
    "埼玉県": "Saitama",
    "千葉県": "Chiba",
    "東京都": "Tokyo",
    "神奈川県": "Kanagawa",

    # Chubu
    "新潟県": "Niigata",
    "富山県": "Toyama",
    "石川県": "Ishikawa",
    "福井県": "Fukui",
    "山梨県": "Yamanashi",
    "長野県": "Nagano",
    "岐阜県": "Gifu",
    "静岡県": "Shizuoka",
    "愛知県": "Aichi",

    # Kinki
    "三重県": "Mie",
    "滋賀県": "Shiga",
    "京都府": "Kyoto",
    "大阪府": "Osaka",
    "兵庫県": "Hyogo",
    "奈良県": "Nara",
    "和歌山県": "Wakayama",

    # Chugoku
    "鳥取県": "Tottori",
    "島根県": "Shimane",
    "岡山県": "Okayama",
    "広島県": "Hiroshima",
    "山口県": "Yamaguchi",

    # Shikoku
    "徳島県": "Tokushima",
    "香川県": "Kagawa",
    "愛媛県": "Ehime",
    "高知県": "Kochi",

    # Kyushu and Okinawa
    "福岡県": "Fukuoka",
    "佐賀県": "Saga",
    "長崎県": "Nagasaki",
    "熊本県": "Kumamoto",
    "大分県": "Oita",
    "宮崎県": "Miyazaki",
    "鹿児島県": "Kagoshima",
    "沖縄県": "Okinawa",

    # Outside Japan
    "国外": "Other",
}

# Fetch the webpage. The timeout prevents the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the table containing the file IDs: the second 'tbl02' block inside
# the 'inBox21' container (presumably the Unicode-encoded download table,
# going by the variable name -- verify against the live page).
# Each lookup is checked so that a change in the page layout produces a clear
# error instead of an opaque AttributeError / IndexError.
container = soup.find('div', class_='inBox21')
if container is None:
    raise RuntimeError("Could not find <div class='inBox21'> on the download page")

tables = container.find_all('div', class_='tbl02')
if len(tables) < 2:
    raise RuntimeError(
        f"Expected at least 2 <div class='tbl02'> tables, found {len(tables)}"
    )
unicode_table = tables[1]

# Retrieve all rows from the table
rows = unicode_table.find_all('dl')

# Initialize a dictionary to store region names and corresponding file IDs
region_file_ids = {}

# The file ID is the first run of five digits in the link's onclick handler;
# compile the pattern once rather than on every row.
file_id_pattern = re.compile(r'\d{5}')

# Process each row to extract the region name and file ID
for row in rows:
    name_tag = row.find('dt', class_='mb05')
    link_tag = row.find('a')
    if name_tag is None or link_tag is None:
        raise RuntimeError(f"Unexpected row layout, cannot extract region/link: {row}")

    region_name_jp = name_tag.text.strip()
    # Convert Japanese to English if possible; unknown labels are kept as-is
    region_name = pref.get(region_name_jp, region_name_jp)

    # A missing onclick attribute is treated like a handler without an ID
    match = file_id_pattern.search(link_tag.get('onclick') or '')
    if match is None:
        raise RuntimeError(f"No 5-digit file ID found for region {region_name_jp!r}")
    region_file_ids[region_name] = match.group()

# Output the dictionary containing region names and file IDs.
# ensure_ascii=False writes non-ASCII region names verbatim, so the file must
# be opened as UTF-8 explicitly; the platform default encoding may not be able
# to represent them.
with open('file_id.json', 'w', encoding='utf-8') as file:
    json.dump(region_file_ids, file, ensure_ascii=False, indent=4)
