In [None]:
!pip install -r requirements.txt

import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

# ----------------------------
# Configuration
# ----------------------------

# Root of the remote site; every request URL and header below derives from it.
BASE_URL = "https://ndhrhis.doh.gov.ph"
# Directory holding the saved entry page for each year (read-only input).
HTML_DIR = "complete_html"
# Root directory under which all CSV output is written.
OUT_DIR = "complete_csv"
os.makedirs(OUT_DIR, exist_ok=True)

# Per-year request parameters, keyed by report year.
# Each value is a (seqn, title, gdate) tuple; the three fields are inserted
# verbatim into the POST URL's "prm" payload.
YEAR_CONFIG = {
    2024: ("02", "As of December 31, 2024", "2025-01-03"),
    2023: ("02", "As of December 2023", "2024-01-02"),
    2022: ("02", "As of December 2022", "2023-01-04"),
    2021: ("03", "As of December 2021", "2022-01-03"),
    2020: ("01", "As of December 31, 2020", "2020-07-22"),
    2019: ("01", "As Of December 31, 2019", "2020-01-24"),  # capital "Of"
    2018: ("01", "As of December 31, 2018", "2018-09-07"),
    2017: ("03", "As of December 31, 2017 - Third set of test data", "2018-06-10"),
}

# Headers sent with every form POST.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": f"{BASE_URL}/RPA0001b.php",
    "Origin": BASE_URL,
    "Content-Type": "application/x-www-form-urlencoded",
}

# ----------------------------
# Helpers
# ----------------------------

def extract_dropdown_values(soup):
    """Return the usable ``(value, label)`` pairs of the ``ddparams`` dropdown.

    Looks up the ``<select name="ddparams">`` element in *soup* and collects
    one pair per ``<option>``, both parts whitespace-stripped. Options with
    an empty value, an empty label, or a value equal to ``null``
    (case-insensitive) are dropped. An empty list is returned when the
    page has no such dropdown.
    """
    dropdown = soup.find("select", attrs={"name": "ddparams"})
    if not dropdown:
        return []

    candidates = (
        (option.get("value", "").strip(), option.text.strip())
        for option in dropdown.find_all("option")
    )
    return [
        (value, label)
        for value, label in candidates
        if value and label and value.lower() != 'null'
    ]

def build_post_url(level, year):
    """Build the POST endpoint URL for one hierarchy *level* of a given *year*.

    The year's ``(seqn, title, gdate)`` entry from ``YEAR_CONFIG`` is inserted
    verbatim into the ``prm`` payload, a ``^``-separated list of ``key=value``
    fields. Raises ``KeyError`` when *year* is not configured.
    """
    seqn, title, gdate = YEAR_CONFIG[year]
    fields = [
        f"level={level}",
        f"year={year}",
        f"seqn={seqn}",
        f"title={title}",
        f"gdate={gdate}",
        "allfltr=0",
        "prvslct=A",
        "prvlist=",
        "sbrep=A%20B%20C%20D%20E",
    ]
    payload = "^".join(fields)
    return f"{BASE_URL}/system.bcall.page.php?xcrs=RPA0001b.php&prm={payload}"

def get_html_for_year(year):
    """Load and parse the saved entry page for *year* from ``HTML_DIR``.

    The file is expected at ``<HTML_DIR>/Distribution-Nationwide <year>.html``
    and is read as UTF-8. Returns the parsed ``BeautifulSoup`` document;
    errors from opening or reading the file propagate to the caller.
    """
    filename = f"Distribution-Nationwide {year}.html"
    with open(os.path.join(HTML_DIR, filename), encoding="utf-8") as handle:
        markup = handle.read()
    return BeautifulSoup(markup, "lxml")

def submit_and_parse(session, url, ddvalue, timeout=60):
    """POST one dropdown selection to *url* and return the parsed response.

    Args:
        session: ``requests.Session`` used to send the request.
        url: Endpoint URL, as produced by ``build_post_url``.
        ddvalue: Value submitted in the ``ddparams`` form field.
        timeout: Seconds to wait for the server to connect or send data before
            giving up. Without a timeout ``requests`` waits forever, so one
            stalled response would hang the worker thread for good.

    Returns:
        The response body parsed into a ``BeautifulSoup`` document.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    # Short pauses before and after the request keep a small gap between
    # consecutive calls made by the same worker.
    time.sleep(0.01)
    resp = session.post(
        url,
        headers=HEADERS,
        data={"ddparams": ddvalue, "submit": "Submit"},
        timeout=timeout,
    )
    resp.raise_for_status()
    time.sleep(0.01)
    return BeautifulSoup(resp.text, "lxml")

def sanitize_filename(name):
    """Turn a place label into a string usable as a file or directory name.

    Every character that is not a word character, whitespace, or a hyphen is
    removed, then each space character becomes an underscore. Other
    whitespace (tabs, newlines) is left unchanged.
    """
    kept = re.sub(r'[^\w\s-]', '', name)
    # Splitting on a literal space and re-joining is a one-for-one swap, so
    # runs of spaces yield runs of underscores.
    return '_'.join(kept.split(' '))

def extract_and_save_tables(soup, outdir, place_name, year, level):
    """Write every report table found in *soup* to its own CSV in *outdir*.

    Only ``<table class="RepT">`` elements whose ``id`` starts with
    ``treport`` followed by an uppercase letter are exported; that letter
    becomes the table's category in the file name
    ``<sanitized place_name>_TABLE_<letter>.csv``. The first non-empty row
    supplies the column headers and the remaining rows the data. Tables with
    fewer than two non-empty rows are skipped.

    *outdir* is created if missing, even when no table is exported.
    *year* and *level* are used only in the progress message.
    """
    os.makedirs(outdir, exist_ok=True)

    for report in soup.find_all("table", class_="RepT"):
        id_match = re.match(r"treport([A-Z])", report.get("id", ""))
        if id_match is None:
            continue
        letter = id_match.group(1)

        # Collect the text of each row, ignoring rows without any cells.
        grid = []
        for tr in report.find_all("tr"):
            cells = tr.find_all(["td", "th"])
            if cells:
                grid.append([cell.get_text(strip=True) for cell in cells])

        if len(grid) < 2:
            continue

        header, body = grid[0], grid[1:]
        frame = pd.DataFrame(body, columns=header)
        fname = f"{sanitize_filename(place_name)}_TABLE_{letter}.csv"
        frame.to_csv(os.path.join(outdir, fname), index=False)
        print(f"✅ Saved CSV: {year}/{level}/{fname}")

# ----------------------------
# Main Routine per Year
# ----------------------------

# Hierarchy levels below the nationwide entry page, outermost first:
# (level code sent to the server, label used in log messages).
_LEVELS = ((2, "Region"), (3, "Province"), (4, "Municipality"))


def _crawl_level(session, year, parent_soup, parent_dir, depth=0):
    """Fetch, save and recurse into every dropdown entry of *parent_soup*.

    For each entry of the parent's dropdown, the entry's page is requested,
    its tables are written to ``<parent_dir>/<sanitized label>``, and, unless
    this is the last level, the same is done for the entry's own dropdown.
    A failure in one entry is logged and skips that entry (and everything
    beneath it) without stopping its siblings.
    """
    level_code, level_name = _LEVELS[depth]
    # The URL depends only on level and year, so build it once per level
    # instead of once per entry.
    url = build_post_url(level=level_code, year=year)
    has_children = depth + 1 < len(_LEVELS)

    for value, label in extract_dropdown_values(parent_soup):
        try:
            soup = submit_and_parse(session, url, value)
            place_dir = os.path.join(parent_dir, sanitize_filename(label))
            extract_and_save_tables(soup, place_dir, label, year, level_name)
            if has_children:
                _crawl_level(session, year, soup, place_dir, depth + 1)
        except Exception as e:
            # Deliberate best-effort: log and move on to the next sibling.
            print(f"❌ Error in {level_name.lower()} {label}: {e}")


def process_year(year):
    """Scrape all region, province and municipality tables for one *year*.

    Starts from the saved entry page for *year*, then walks the
    region -> province -> municipality dropdowns, writing each place's
    tables under ``<OUT_DIR>/<year>/<region>/<province>/<municipality>``.

    Never raises for ordinary errors: a failure while loading the entry page
    or preparing the first level is reported as an initialization error, and
    failures further down are reported per place. Returns ``None``.
    """
    try:
        print(f"\n🔍 Processing year: {year}")
        entry_soup = get_html_for_year(year)
        year_dir = os.path.join(OUT_DIR, str(year))
        # The context manager closes the session's pooled connections once
        # the year is done, including when an error escapes the crawl.
        with requests.Session() as session:
            _crawl_level(session, year, entry_soup, year_dir)
    except Exception as e:
        print(f"❌ Could not initialize year {year}: {e}")

# ----------------------------
# Entry Point
# ----------------------------

if __name__ == "__main__":
    # One task per configured year, run on up to eight worker threads.
    years = sorted(YEAR_CONFIG)
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = [pool.submit(process_year, yr) for yr in years]
        # Collect results as tasks finish so that any exception escaping
        # process_year is re-raised here rather than silently dropped.
        for finished in as_completed(pending):
            finished.result()
