## データ収集 (Data Collection)

本ノートブックでは、対象自治体の児童手当に関するWebページを収集します。

### フロー
1. **自動検索 & 候補抽出**: 定義された検索パターンごとに、指定された優先ドメインを持つURLを抽出し、`data/source_urls.json` に保存。
2. **手動確認**: `data/source_urls.json` を確認・修正。
3. **データ保存**: 修正後の `data/source_urls.json` を読み込み、データをダウンロードして保存。

In [11]:
import os
import json
import time
import requests
import hashlib
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
import config

# Settings
# Output directory for the extracted plain text, cache directory for raw HTML,
# and the JSON file holding the (manually reviewed) list of source URLs.
DATA_DIR = "../data/raw_text"
HTML_CACHE_DIR = "../data/raw_html"
JSON_PATH = "../data/source_urls.json"
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(HTML_CACHE_DIR, exist_ok=True)

# The search step is opt-in; when False only the download step runs.
ENABLE_GOOGLE_SEARCH = False
# SECURITY: never commit API credentials to the notebook. The key is read from
# the GOOGLE_API_KEY environment variable; any key that was previously
# committed here must be treated as leaked and revoked/rotated.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
# The Programmable Search Engine ID is an identifier rather than a secret;
# it can be overridden with the GOOGLE_CSE_ID environment variable.
CUSTOM_SEARCH_ENGINE_ID = os.environ.get("GOOGLE_CSE_ID", "3420ec94d15d34b85")

In [12]:
def get_html_content(url):
    """
    Return the HTML of ``url``, using the on-disk cache when available.

    The cache file lives in HTML_CACHE_DIR and is named after the MD5 hex
    digest of the URL. On a cache miss the page is downloaded (10 second
    timeout, HTTP errors raised via ``raise_for_status``), decoded with the
    encoding detected from the response body, written to the cache, and
    returned.
    """
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    cached_file = os.path.join(HTML_CACHE_DIR, digest + ".html")

    if not os.path.exists(cached_file):
        # Cache miss: download, then store the decoded text for next time.
        print(f"Fetching: {url}")
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        response.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # declared in the response headers.
        response.encoding = response.apparent_encoding
        page = response.text

        with open(cached_file, 'w', encoding='utf-8') as out:
            out.write(page)
        return page

    print(f"  -> Loading from cache: {cached_file}")
    with open(cached_file, 'r', encoding='utf-8') as cached:
        return cached.read()

def clean_html(soup):
    """
    Strip non-content elements (header, nav, footer, script, style).

    The matching elements are removed from ``soup`` in place, and the same
    ``soup`` object is returned for convenience.
    """
    unwanted_tags = ['header', 'nav', 'footer', 'script', 'style']
    matches = soup(unwanted_tags)
    for tag in matches:
        tag.decompose()
    return soup

def _markdown_cell(cell):
    """Return a table cell's text as a single-line, pipe-safe Markdown cell."""
    # A Markdown table row must stay on one line, so collapse every run of
    # whitespace (including embedded newlines) into a single space.
    text = ' '.join(cell.get_text(strip=True).split())
    # A literal '|' would be read as a column separator; escape it.
    return text.replace('|', '\\|')


def convert_tables_to_markdown(soup):
    """
    Replace every HTML ``table`` in ``soup`` with a Markdown rendering.

    The first row of each table is used as the Markdown header (whether its
    cells are ``th`` or ``td``). Body rows shorter than the header are padded
    with empty cells; rows longer than the header are emitted as-is. Each
    table is replaced in place by a ``div`` whose string is the Markdown
    text, and the same ``soup`` object is returned.
    """
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        if not rows:
            continue

        markdown_table = []

        # Header: first row, plus the '---' separator line Markdown requires.
        headers = [_markdown_cell(cell) for cell in rows[0].find_all(['th', 'td'])]
        if headers:
            markdown_table.append('| ' + ' | '.join(headers) + ' |')
            markdown_table.append('| ' + ' | '.join(['---'] * len(headers)) + ' |')

        # Body
        for row in rows[1:]:
            cols = [_markdown_cell(cell) for cell in row.find_all(['td', 'th'])]
            if not cols:
                continue
            # Pad short rows so every row has at least as many cells as the
            # header. Wider rows are kept unchanged (no data is dropped).
            if len(cols) < len(headers):
                cols += [''] * (len(headers) - len(cols))
            markdown_table.append('| ' + ' | '.join(cols) + ' |')

        # Swap the table element for a div holding the Markdown text.
        new_tag = soup.new_tag('div')
        new_tag.string = '\n' + '\n'.join(markdown_table) + '\n'
        table.replace_with(new_tag)

    return soup

In [13]:
# Search patterns and their preferred domains.
# Each entry provides: the text appended to the city name to form the query
# ("pattern_suffix"), the domain fragments that mark a result as preferred
# ("trusted_domains"), and the suffix used in the saved file name
# ("file_suffix").
SEARCH_CONFIGS = [
    dict(
        pattern_suffix="児童手当 認定請求 site:app.oss.myna.go.jp",
        trusted_domains=["app.oss.myna.go.jp"],
        file_suffix="digital",
    ),
    dict(
        pattern_suffix="児童手当 認定請求",
        trusted_domains=[".lg.jp", "city."],
        file_suffix="homepage",
    ),
]

In [14]:
def get_google_search_results(query, trusted_domains, num_results=10):
    """
    Search with the Google Custom Search API and pick one result URL.

    The first result whose hostname contains one of ``trusted_domains`` is
    returned; if none matches, the top-ranked result is returned.

    Args:
        query: Full search query string.
        trusted_domains: Domain fragments (e.g. ".lg.jp") that mark a result
            as preferred. They are matched against the URL's hostname only.
        num_results: Number of results to request. The API accepts 1-10, so
            the value is clamped to that range.

    Returns:
        The selected URL, or None when the search is disabled, the API call
        fails, or there are no results.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    def _hostname(url):
        # A malformed URL is simply treated as "not trusted".
        try:
            return (urlparse(url).hostname or "").lower()
        except ValueError:
            return ""

    if not ENABLE_GOOGLE_SEARCH:
        print("Google Search API is disabled")
        return None
    print(f"Searching API: {query} ...")

    try:
        service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)

        # API call
        res = service.cse().list(
            q=query,
            cx=CUSTOM_SEARCH_ENGINE_ID,
            lr='lang_ja',
            num=max(1, min(num_results, 10))
        ).execute()

        items = res.get("items", [])
        if not items:
            print("  -> No results found.")
            return None

        candidates = [item['link'] for item in items]

        # 1. Prefer the highest-ranked result on a trusted domain. Matching
        #    against the hostname (not the whole URL) prevents a path or
        #    query string such as "?ref=city.example" from counting.
        for url in candidates:
            host = _hostname(url)
            if any(domain.lower() in host for domain in trusted_domains):
                return url

        # 2. Otherwise fall back to the top result.
        return candidates[0]

    except Exception as e:
        # Best-effort boundary: any API/parsing failure yields "no result"
        # so the caller can continue with the remaining queries.
        print(f"API Error: {e}")
        return None

def search_city_urls():
    """
    Run every search pattern for every target city and collect the results.

    Iterates over ``config.TARGET_CITIES`` and ``SEARCH_CONFIGS``, queries
    ``get_google_search_results`` for "<city name> <pattern>", and pauses one
    second between queries.

    Returns:
        A list of dicts with the keys "city_id", "city_name",
        "search_pattern", "query", "url" (empty string when nothing was
        found) and "file_suffix".
    """
    extracted_data = []
    for city in config.TARGET_CITIES:
        city_name = city["name"]

        # NOTE: the loop variable must not be named ``config``; that would
        # make ``config`` a local name and break ``config.TARGET_CITIES``
        # above with UnboundLocalError.
        for search_config in SEARCH_CONFIGS:
            pattern = search_config["pattern_suffix"]
            domains = search_config["trusted_domains"]

            full_query = f"{city_name} {pattern}"
            best_url = get_google_search_results(full_query, domains)

            entry = {
                "city_id": city["id"],
                "city_name": city_name,
                "search_pattern": pattern,
                "query": full_query,
                "url": best_url if best_url else "",
                "file_suffix": search_config["file_suffix"]
            }
            extracted_data.append(entry)
            print(f"  -> Selected: {best_url}")
            # Pause between queries to stay gentle on the API.
            time.sleep(1)
    return extracted_data


# Run the search step only when explicitly enabled, then persist the
# candidates so they can be reviewed by hand before downloading.
if ENABLE_GOOGLE_SEARCH:
    print("検索を開始します...")
    # Keep the returned list: it is what gets written to JSON_PATH below.
    extracted_data = search_city_urls()
    with open(JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=2, ensure_ascii=False)
    print(f"\n検索完了。結果を {JSON_PATH} に保存しました。")
else:
    print("Google Search API is disabled")

Google Search API is disabled


### スクレイピング

作成された `source_urls.json` を開いて確認し、必要に応じてURLを修正してください。

確認が終わったら、次のセルを実行してデータをダウンロードします。

In [15]:
# Load the reviewed URL list and download/convert each page.
if not os.path.exists(JSON_PATH):
    print(f"{JSON_PATH} が見つかりません。")
else:
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        source_list = json.load(f)

    print(f"{len(source_list)} 件のURLを処理します...")

    for item in source_list:
        url = item.get("url")
        city_id = item.get("city_id")
        pattern = item.get("search_pattern")
        file_suffix = item.get("file_suffix", "misc")

        if not url:
            # The JSON is edited by hand, so read "city_name" with .get like
            # the other fields; a missing key must not abort the whole run.
            print(f"Skipping empty URL for {item.get('city_name')} - {pattern}")
            continue

        try:
            # 1. Fetch the HTML (served from the local cache when present).
            html_content = get_html_content(url)

            # 2. Parse and strip non-content elements.
            soup = BeautifulSoup(html_content, "html.parser")
            soup = clean_html(soup)

            # 3. Convert tables to Markdown text.
            soup = convert_tables_to_markdown(soup)

            # 4. Extract the text, one line per text node.
            text = soup.get_text(separator="\n", strip=True)

            # File name: city_id + the suffix given in the JSON entry.
            base_filename = f"{city_id}_{file_suffix}"
            filename = f"{base_filename}.txt"

            filepath = os.path.join(DATA_DIR, filename)
            with open(filepath, "w", encoding="utf-8") as f:
                # Small provenance header, then a blank line, then the text.
                f.write(f"Source URL: {url}\n")
                f.write(f"Search Pattern: {pattern}\n")
                f.write("\n")
                f.write(text)

            print(f"  -> Saved to {filename}")

        except Exception as e:
            # Best-effort per URL: report the failure and continue with the
            # remaining entries.
            print(f"  -> Error fetching {url}: {e}")

6 件のURLを処理します...
  -> Loading from cache: ../data/raw_html/a005f9d3f508ffbd04e57bc895c4a653.html
  -> Saved to 13105_digital.txt
  -> Loading from cache: ../data/raw_html/a174e1bd681202caecdea4aa681657f5.html
  -> Saved to 13105_general.txt
  -> Loading from cache: ../data/raw_html/334f1ac7f7ede2c5f8ef7ef0f91dc5fa.html
  -> Saved to 13106_digital.txt
  -> Loading from cache: ../data/raw_html/e6ac298379a32f65611709afd65a98de.html
  -> Saved to 13106_general.txt
  -> Loading from cache: ../data/raw_html/12923589610933bd656fba75259a02f3.html
  -> Saved to 47201_digital.txt
  -> Loading from cache: ../data/raw_html/565a9af2a7bf989f8464eb050412ce28.html
  -> Saved to 47201_general.txt
