In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re # Import the re module for regular expressions

def scrape_apj_museums():
    url = "https://artplatform.go.jp/museums"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    print(f"Connecting to {url}...")

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching page: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    museum_list = []

    # The main container for the list
    main_content = soup.find('main') or soup.find('body')

    # List of all 47 prefectures in Japan
    prefectures_list = [
        "Hokkaido", "Aomori", "Iwate", "Miyagi", "Akita", "Yamagata", "Fukushima",
        "Ibaraki", "Tochigi", "Gunma", "Saitama", "Chiba", "Tokyo", "Kanagawa",
        "Niigata", "Toyama", "Ishikawa", "Fukui", "Yamanashi", "Nagano", "Gifu",
        "Shizuoka", "Aichi", "Mie", "Shiga", "Kyoto", "Osaka", "Hyogo", "Nara",
        "Wakayama", "Tottori", "Shimane", "Okayama", "Hiroshima", "Yamaguchi",
        "Tokushima", "Kagawa", "Ehime", "Kochi", "Fukuoka", "Saga", "Nagasaki",
        "Kumamoto", "Oita", "Miyazaki", "Kagoshima", "Okinawa"
    ]

    current_prefecture = "Unknown"

    # Iterate through elements, looking for headers that contain prefecture names.
    # The site often uses 'div' or 'h4' with text like "Tokyo 73"
    # We'll also consider 'a' tags for museum links.
    elements_to_process = main_content.find_all(['h3', 'h4', 'div', 'span', 'a'])

    for element in elements_to_process:
        text = element.get_text().strip()
        href = element.get('href', '')

        is_prefecture_header = False
        # Check if the element text contains a prefecture name, potentially with a count
        for pref in prefectures_list:
            # Regex to match 'PrefectureName' as a whole word, optionally followed by spaces and numbers
            # e.g., "Hokkaido", "Hokkaido 223", "Tokyo (73)"
            if re.search(r'\b' + re.escape(pref) + r'(?:\s*\(\d+\)|\s*\d+)?\b', text, re.IGNORECASE):
                current_prefecture = pref
                is_prefecture_header = True
                break

        if is_prefecture_header:
            continue # This element was a prefecture header, move to the next.

        # Check if this element is a museum link (and ensure it's an <a> tag)
        if "/museums/M" in href and element.name == 'a':
            museum_id = href.split('/')[-1]

            # Extract names - structured as EN Name | JP Name
            name_text = element.get_text(separator="|").strip()
            names = [n.strip() for n in name_text.split("|") if n.strip()]

            name_en = names[0] if len(names) > 0 else "N/A"
            name_jp = names[1] if len(names) > 1 else "N/A"

            # Skip elements that are just numbers (the count labels)
            if name_en.isdigit():
                continue

            museum_list.append({
                "apj_id": museum_id,
                "name_en": name_en,
                "name_jp": name_jp,
                "prefecture": current_prefecture,
                "url": f"https://artplatform.go.jp{href}"
            })

    if not museum_list:
        print("No museums found. The page structure might have changed.")
        return None

    df = pd.DataFrame(museum_list)

    # Deduplicate entries
    df = df.drop_duplicates(subset=['apj_id'])

    # Final check: Remove any artifacts that aren't actually museums (e.g., regional total counts)
    df = df[~df['name_en'].str.contains('^[0-9]+$', na=False)]

    print(f"Successfully extracted {len(df)} art museums across Japan.")
    return df

if __name__ == "__main__":
    df_museums = scrape_apj_museums()

    if df_museums is not None:
        cols = ['apj_id', 'name_en', 'name_jp', 'prefecture', 'url']
        df_museums = df_museums[cols]

        filename = "japan_art_museums_fixed.csv"
        df_museums.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"File saved as {filename}")

        print("\nMuseums per Prefecture (Top 10):")
        print(df_museums['prefecture'].value_counts().head(10))

        print("\nSample Data:")
        print(df_museums.head())

Connecting to https://artplatform.go.jp/museums...
Successfully extracted 171 art museums across Japan.
File saved as japan_art_museums_fixed.csv

Museums per Prefecture (Top 10):
prefecture
Tokyo        21
Aichi        13
Shimane      12
Chiba        12
Fukui        11
Hokkaido      9
Osaka         8
Gunma         7
Yamaguchi     7
Ibaraki       7
Name: count, dtype: int64

Sample Data:
    apj_id                                            name_en  \
0  M100001                Kanda Nissho Memorial Museum of Art   
1  M100003                         Kushiro City Museum of Art   
2  M100004                                 Sapporo Art Museum   
3  M100005                           Otaru City Museum of Art   
4  M100006  Asahikawa Museum of Sculpture in Honor of Teij...   

           name_jp prefecture                                        url  
0        神田日勝記念美術館   Hokkaido  https://artplatform.go.jp/museums/M100001  
1          釧路市立美術館   Hokkaido  https://artplatform.go.jp/museums/M10

In [7]:
print("Museums per Prefecture:")
print(df_museums['prefecture'].value_counts())

Museums per Prefecture:
prefecture
Tokyo        21
Aichi        13
Shimane      12
Chiba        12
Fukui        11
Hokkaido      9
Osaka         8
Gunma         7
Yamaguchi     7
Ibaraki       7
Miyagi        5
Hiroshima     5
Niigata       4
Tochigi       4
Tokushima     4
Ehime         3
Shizuoka      3
Fukushima     3
Kochi         3
Wakayama      3
Ishikawa      3
Iwate         2
Oita          2
Yamagata      2
Hyogo         2
Gifu          2
Nara          2
Kumamoto      2
Akita         1
Saitama       1
Mie           1
Yamanashi     1
Kyoto         1
Shiga         1
Nagasaki      1
Fukuoka       1
Miyazaki      1
Kagoshima     1
Name: count, dtype: int64


In [3]:
df_museums.to_csv('exported_museums.csv', index=False, encoding='utf-8-sig')
print("DataFrame exported to 'exported_museums.csv'")

DataFrame exported to 'exported_museums.csv'
