In [None]:
%pip install selenium==4.35.0 pandas==2.1.4 webdriver-manager==4.0.2 seleniumbase==4.41.1

In [None]:
import json
import time
import string
import datetime
import pandas as pd
from seleniumbase import Driver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chromium.webdriver import ChromiumDriver
from selenium.webdriver.common.action_chains import ActionBuilder, ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.relative_locator import locate_with
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException


def setup_driver():
    """Create a headless Chrome WebDriver with automation hints reduced.

    The chromedriver path comes from ``ChromeDriverManager().install()``.

    Note from the original author: with this default driver you need to add
    ``wait.until(...)`` before performing any interaction.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    for flag in (
        '--headless=new',
        '--allow-running-insecure-content',
        '--allow-insecure-localhost',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--start-maximized',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    ):
        options.add_argument(flag)

    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # A script run via execute_script only patches the document that is
        # loaded at call time, so the override would be gone after the first
        # navigation. Registering it through CDP re-applies it to every new
        # document before the page's own scripts run.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leak a running browser if the setup step fails.
        driver.quit()
        raise

    return driver

def setup_driver_seleniumbase():
    """Return a SeleniumBase ``Driver`` created with ``uc=True`` and ``headless=True``.

    Per the original author's note, the Chromium binary is installed
    automatically by SeleniumBase.
    """
    return Driver(uc=True, headless=True)

def setup_driver_firefox():
    """Create a headless Firefox WebDriver with a 1920x1080 window.

    CAREFUL (original author's note): action chains move doesn't work with
    this driver, use action builder for moving instead.

    Returns:
        The configured ``webdriver.Firefox`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument('--headless')

    # NOTE(review): the next four are Chromium switches carried over from the
    # Chrome setup; presumably Firefox ignores them — confirm and drop.
    options.add_argument('--disable-gpu')
    options.add_argument('--start-maximized')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Firefox sizes its window with --width/--height. The Chromium spelling
    # "window-size=1920,1080" (no leading dashes) is not a Firefox option, so
    # the larger window size was never applied.
    options.add_argument("--width=1920")
    options.add_argument("--height=1080")

    # Additional options for WSL2 compatibility (original author's note).
    options.set_preference("dom.webdriver.enabled", False)
    # NOTE(review): 'useAutomationExtension' is a Chrome experimental option
    # name; as a Firefox preference it is presumably a no-op — confirm.
    options.set_preference('useAutomationExtension', False)

    options.profile = FirefoxProfile()

    return webdriver.Firefox(options=options)

def _safe_text(element):
    """Return ``element.text``, or ``"<unavailable>"`` if reading it raises."""
    # Used when building log messages inside ``except`` handlers: reading
    # ``.text`` there can raise again, which would escape the handler that was
    # meant to skip the item and abort the surrounding loop.
    try:
        return element.text
    except Exception:
        return "<unavailable>"


def main():
    """Main function to run the scraper.

    Flow, as implemented below:
      1. Open the drug list page with the SeleniumBase driver.
      2. Collect the single-letter buttons and visit the selected letter(s).
      3. For every drug entry: click it and collect its non-empty subdrugs.
      4. For every subdrug: click it, read each ``<li>`` detail row as a
         key/value pair on every page, page back to the first page, then
         close the detail panel.
      5. Close the drug panel and continue with the next drug.

    Errors on a row, subdrug or drug are printed and that item is skipped.
    The driver is always quit, even when an error propagates.

    Results are accumulated per letter in ``drugs_details_list``; the JSON
    export at the end of the letter loop is currently commented out.
    """
    base_url = "https://e-fornas.kemkes.go.id/guest/daftar-obat"
    driver = setup_driver_seleniumbase()

    try:
        driver.uc_open_with_reconnect(base_url)  # Only for seleniumbase, use driver.get for default driver
        actions = ActionChains(driver)

        letter_button_container = driver.find_element(By.CSS_SELECTOR, "div.w-full.flex.gap-3.justify-between")
        letter_buttons = letter_button_container.find_elements(By.TAG_NAME, "button")

        # Keep only the buttons whose label is a single alphabetic character
        letters = [btn for btn in letter_buttons if len(btn.text) == 1 and btn.text.isalpha()]

        for letter in letters:
            letter_text = letter.text

            # Only the listed letters are scraped; everything else is skipped
            if letter_text not in ["Z"]:
                continue

            print("\n======================================")
            print(f"Scraping drugs for letter: '{letter_text}'")
            print("======================================")

            drugs_details_list = []

            actions.move_to_element(letter).pause(0.2).click().pause(1).perform()
            # Log the letter itself, not the WebElement repr
            print(f"Page found for letter '{letter_text}'.")

            drugs = driver.find_elements(By.XPATH, "/html/body/div[1]/div[1]/div/div[2]/div[2]/div/div/div")

            for drug in drugs:

                try:
                    subdrug_list = []

                    actions.pause(0.2).move_to_element(drug).pause(0.2).click(drug).pause(1).perform()
                    drug_name = drug.text
                    print(f"Drug Name: '{drug_name}' and the clicked!")

                    # The XPaths below are absolute, so they resolve from the
                    # document root even though they are called on an element.
                    subdrugs_t = drug.find_elements(By.XPATH, '/html/body/div[2]/div/div[3]/div/div[2]/div[2]/div')

                    # Only take subdrug contain a word
                    subdrugs = [subdrug for subdrug in subdrugs_t if len(subdrug.text) > 0]

                    for subdrug in subdrugs:

                        try:
                            actions.pause(0.5).move_to_element(subdrug).pause(2).click(subdrug).pause(1).perform()
                            print(f"Subdrug {subdrug.text} clicked, table shown.")

                            # One record is appended per page of the detail table
                            while True:
                                subdrug_details_list = {"name": subdrug.text}
                                rows_subdrug = subdrug.find_elements(By.XPATH, '/html/body/div[3]/div/div[3]/div/div[2]/div[2]/li')

                                if len(rows_subdrug) == 0:
                                    print("Table is empty")
                                    subdrug_list.append(subdrug_details_list)
                                    break

                                for row_t in rows_subdrug:
                                    try:
                                        key_element = row_t.find_element(By.XPATH, "(.//span)[1]")
                                        value_element = row_t.find_element(By.XPATH, "(.//span)[2]")

                                        key = key_element.text.strip().lower()
                                        value = value_element.text.strip()

                                        # An empty value may be a checkbox: store True when it is checked
                                        if not value:
                                            try:
                                                label = value_element.find_element(By.TAG_NAME, "label")
                                                value_cls = label.get_attribute("class")
                                                if "ant-checkbox-wrapper-checked" in value_cls:
                                                    value = True
                                                else:
                                                    print("Label is not a checkbox or is not checked.")
                                            # A tuple is required here: `A or B` evaluates to just `A`
                                            except (NoSuchElementException, TimeoutException):
                                                print("Label is not found, therefore skipped.")

                                        subdrug_details_list[key] = value
                                    except Exception as e:
                                        print(f"Row {_safe_text(row_t)} might get skipped with following error: {e}")

                                subdrug_list.append(subdrug_details_list)

                                # Check if the next button is disabled then break, otherwise click to next page
                                next_btn = drug.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div/div[2]/div[2]/div/ul/li[contains(@class, "ant-pagination-next")]')
                                next_cls = next_btn.get_attribute("class")
                                is_disabled = 'ant-pagination-disabled' in next_cls
                                if is_disabled:
                                    print("Next button is disabled. Last page reached.")
                                    # Reset the pagination back to 1, otherwise render is suppressed resulting blank page
                                    while True:
                                        back_btn = drug.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div/div[2]/div[2]/div/ul/li[contains(@class, "ant-pagination-prev")]')
                                        back_cls = back_btn.get_attribute("class")
                                        is_prev_disabled = 'ant-pagination-disabled' in back_cls
                                        if is_prev_disabled:
                                            print("Back button is disabled. First page reached.")
                                            break
                                        actions.scroll_to_element(back_btn).pause(0.2).click(back_btn).pause(1).perform()
                                        print("Back button is clicked.")
                                    break
                                actions.scroll_to_element(next_btn).pause(0.2).click(next_btn).pause(1).perform()
                                print("Next button is clicked.")

                            # Scroll to top and close 2nd sidebar
                            driver.execute_script("window.scrollTo(0, 0);")
                            close_btn_subdrug_detail = drug.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div/div[1]/div/button[contains(@class, "ant-drawer-close")]')
                            actions.scroll_to_element(close_btn_subdrug_detail).pause(0.2).click(close_btn_subdrug_detail).pause(1).perform()
                            print("2nd Sidebar is confirmed closed. Proceeding.")
                        except Exception as e:
                            print(f"Subdrug {_safe_text(subdrug)} might get skipped with following error: {e}")

                    drugs_details_list.append({"name": drug_name, "subdrugs": subdrug_list})

                    # Scroll to top and close 1st sidebar
                    driver.execute_script("window.scrollTo(0, 0);")
                    close_button_subdrug = drug.find_element(By.XPATH, '/html/body/div[2]/div/div[3]/div/div[1]/div/button')
                    actions.scroll_to_element(close_button_subdrug).pause(0.2).click(close_button_subdrug).pause(1).perform()
                    print("1st Sidebar is confirmed closed. Proceeding.")

                except Exception as e:
                    print(f"Drug {_safe_text(drug)} might get skipped with following error: {e}")

            # Export data to JSON (currently disabled). The file name pattern
            # matches the "fornas_drug_data_*.json" glob used further down.
            # if drugs_details_list:
            #     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            #     with open(f'fornas_drug_data_{letter.text}_{timestamp}.json', 'w', encoding='utf-8') as f:
            #         json.dump(drugs_details_list, f, ensure_ascii=False, indent=4)
            # else:
            #     print("No data was scraped. No files were created.")

    finally:
        # Ensure the driver is closed even if errors occur
        driver.quit()

# Entry point: run the scraper when this code is executed directly.
if __name__ == "__main__":
    main()


Scraping drugs for letter: 'Z'
Page found for letter '<seleniumbase.undetected.webelement.WebElement (session="4833c12937a780ffaf460e491c072fb6", element="f.40FD17CC08D2EF29984CAC6E7D5C250E.d.E08107549B4C5DD0626D39BEF2253390.e.54")>.
Drug Name: 'zidovudin' and the clicked!
Subdrug: zidovudin - KAPSUL 100 MILIGRAM
Subdrug zidovudin - KAPSUL 100 MILIGRAM clicked, table shown.
Label is not found, therefore skipped.
Next button is disabled. Last page reached.
Back button is disabled. First page reached.
2nd Sidebar is confirmed closed. Proceeding.
Subdrug: zidovudin - SIRUP 50 MILIGRAM / 5 MILILITER
Subdrug zidovudin - SIRUP 50 MILIGRAM / 5 MILILITER clicked, table shown.
Label is not found, therefore skipped.
Next button is disabled. Last page reached.
Back button is disabled. First page reached.
2nd Sidebar is confirmed closed. Proceeding.
1st Sidebar is confirmed closed. Proceeding.
Drug Name: 'zinc' and the clicked!
Subdrug: zinc - DROPS 10 MILIGRAM / MILILITER
Subdrug zinc - DROPS 10

If you run the cell above, you will see warnings (errors that are caught and skipped for now). The known problems are as follows:
- Some clicks on a subdrug div do not register, so the subdrug detail side panel never opens; the scraper then fails to find the back/close buttons because the panel does not exist to begin with. [X]
- The subdrug detail panel is paginated; if you move to another page and do not reset it to the first page, the next subdrug shows a blank page. [V]

Fixes I have tried without success:
- With plain Selenium, using explicit waits with expected conditions.
- Using thread sleeps.
- Switching from the Chromium driver (Selenium or Chrome) to Firefox (GeckoDriver).
- Experimenting with a silent (stealth) driver setup, including driver arguments and options, in case the site implements anti-bot measures.
- Trying different click methods: default click, ActionChains, ActionBuilder, and UC click.
- Experimenting with different selectors.
- Using the UC-only incognito and ad-blocker arguments.

In [None]:
import glob
import json
import pandas as pd
import os

def split_and_export_missing_data(
    json_pattern="fornas_drug_data_*.json",
    output_json="data.json",
    missing_csv="missing_data.csv"
):
    """Split scraped drug data into complete and missing subdrugs.

    Reads every JSON file matching ``json_pattern``. Each file is expected to
    hold a list of ``{"name": ..., "subdrugs": [...]}`` objects. A subdrug is
    "missing" when it is not a dict, or when it is a dict with no field other
    than ``"name"`` (including an empty dict).

    Args:
        json_pattern: Glob pattern of the input JSON files.
        output_json: Path of the JSON file that receives the drugs that have
            at least one complete subdrug, sorted by drug name and then by
            subdrug name (case-insensitive).
        missing_csv: Path of the CSV file (columns ``drug_name``,
            ``subdrug_name``) that receives the missing subdrugs. It is only
            written when at least one missing subdrug was found.

    Files that cannot be parsed, files whose top-level value is not a list,
    and drug entries that are not objects are reported and skipped.
    """
    complete_drugs = []
    missing_rows = []

    # Sorted so the output does not depend on the order glob returns files in.
    for file in sorted(glob.glob(json_pattern)):
        with open(file, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except ValueError as e:  # JSONDecodeError and UnicodeDecodeError are ValueErrors
                print(f"Error loading {file}: {e}")
                continue

        if not isinstance(data, list):
            print(f"Skipping {file}: expected a list of drugs, got {type(data).__name__}")
            continue

        for drug in data:
            if not isinstance(drug, dict):
                print(f"Skipping non-object drug entry in {file}: {drug!r}")
                continue

            drug_name = drug.get("name", "")
            complete_subdrugs = []
            # `or []` also covers an explicit "subdrugs": null
            for subdrug in drug.get("subdrugs") or []:
                if not isinstance(subdrug, dict):
                    missing_rows.append({"drug_name": drug_name, "subdrug_name": str(subdrug)})
                elif set(subdrug) <= {"name"}:
                    # Name-only (or completely empty) record: no details were scraped
                    missing_rows.append({"drug_name": drug_name, "subdrug_name": subdrug.get("name", "")})
                else:
                    complete_subdrugs.append(subdrug)

            if complete_subdrugs:
                complete_drugs.append({"name": drug_name, "subdrugs": complete_subdrugs})

    # Sort complete_drugs by drug name, and subdrugs by subdrug name
    complete_drugs.sort(key=lambda d: d["name"].lower())
    for drug in complete_drugs:
        drug["subdrugs"].sort(key=lambda s: s.get("name", "").lower())

    # Export complete data to JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(complete_drugs, f, ensure_ascii=False, indent=4)
    print(f"Exported {len(complete_drugs)} drugs with complete subdrugs to {output_json}")

    # Export missing data to CSV, sorted by drug_name then subdrug_name
    if missing_rows:
        df_missing = pd.DataFrame(missing_rows)
        df_missing = df_missing.sort_values(by=["drug_name", "subdrug_name"], key=lambda col: col.str.lower())
        df_missing.to_csv(missing_csv, index=False, encoding="utf-8")
        print(f"Exported {len(df_missing)} missing subdrugs to {missing_csv}")
    else:
        print("No missing subdrugs found.")

def export_csv_to_fornas_json(input_csv="fornas_drug_data_merged.csv", output_json="fornas_drug_data_exported.json"):
    """
    Convert the edited CSV back into the nested fornas drug JSON layout.

    Every CSV row becomes one subdrug dict holding the row's non-empty
    columns except 'drug_name'. Rows are grouped by 'drug_name', keeping the
    order in which each drug name first appears:
    [
        {
            "name": <drug_name>,
            "subdrugs": [
                { ...subdrug fields... },
                ...
            ]
        },
        ...
    ]
    If input_csv does not exist, a message is printed and nothing is written.
    """
    if not os.path.exists(input_csv):
        print(f"Input CSV file '{input_csv}' not found.")
        return

    # All cells are read as strings; missing cells become "" and are dropped below
    table = pd.read_csv(input_csv, dtype=str).fillna("")

    subdrugs_by_drug = {}
    for _, record in table.iterrows():
        fields = {}
        for column, cell in record.items():
            if column == "drug_name" or cell == "":
                continue
            fields[column] = cell
        subdrugs_by_drug.setdefault(record.get("drug_name", ""), []).append(fields)

    nested = [
        {"name": drug, "subdrugs": entries}
        for drug, entries in subdrugs_by_drug.items()
    ]

    with open(output_json, "w", encoding="utf-8") as handle:
        json.dump(nested, handle, ensure_ascii=False, indent=4)
    print(f"Exported {len(nested)} drugs to {output_json}")


I export the missing data to CSV for easier manual editing.

The workflow is as follows:
- Import the .csv file into a Google spreadsheet.
- Copy the rows from the 'drug' and 'subdrug' columns into Gemini Chat.
- Copy the data from the referenced site, then convert it into JSON format.
- Collect the results as an array of objects (missing_data_halfway.json).
- Reconstruct the flat subdrug list into a nested drug-subdrug array.
- Combine the reconstructed missing-data array with the previous data.json into final_drugs.json.
- Copy the result to /data/drugs.json.

In [None]:
# Run with the defaults: reads fornas_drug_data_*.json, writes data.json and missing_data.csv.
split_and_export_missing_data()

Exported 604 drugs with complete subdrugs to data.json
Exported 129 missing subdrugs to missing_data.csv


In [None]:
def reconstruct_drugs_json_from_file(json_path):
    """
    Regroup a flat list of subdrug records into the nested drug/subdrugs layout.

    The drug name of each record is the part of its "name" field before the
    first " - ", with surrounding whitespace removed. The nested result is
    written to "processed_<input file name>" (file name only, so it lands in
    the current working directory).
    Args:
        json_path (str): Path to the flat JSON file (e.g. missing_data_halfway.json).
    Returns:
        str: Output JSON file name.
    """
    import json
    import os
    from collections import defaultdict

    with open(json_path, "r", encoding="utf-8") as source:
        entries = json.load(source)

    by_drug = defaultdict(list)
    for item in entries:
        prefix, _, _ = item.get("name", "").partition(" - ")
        by_drug[prefix.strip()].append(item)

    nested = []
    for drug, members in by_drug.items():
        nested.append({"name": drug, "subdrugs": members})

    output_name = "processed_" + os.path.basename(json_path)
    with open(output_name, "w", encoding="utf-8") as target:
        json.dump(nested, target, ensure_ascii=False, indent=4)
    return output_name

# Regroup the manually collected flat records by drug; the result is written to processed_missing_data_halfway.json.
reconstruct_drugs_json_from_file("missing_data_halfway.json")

'processed_missing_data_halfway.json'

In [4]:
def join_processed_json_files(input_files, output_file):
    """
    Merge several nested drug JSON files into one file with the same schema.

    Each input holds [{"name": ..., "subdrugs": [...]}, ...]. Subdrugs of a
    drug name that occurs more than once are concatenated in the order they
    are read. The merged list is sorted by drug name, ignoring case.

    Args:
        input_files (list of str): List of input JSON file paths.
        output_file (str): Output JSON file path.

    Returns:
        str: Output file name.
    """
    import json
    from collections import defaultdict

    subdrugs_by_name = defaultdict(list)
    for path in input_files:
        with open(path, "r", encoding="utf-8") as handle:
            for record in json.load(handle):
                subdrugs_by_name[record["name"]].extend(record.get("subdrugs", []))

    ordered_names = sorted(subdrugs_by_name, key=lambda name: name.lower())
    merged = [
        {"name": name, "subdrugs": subdrugs_by_name[name]}
        for name in ordered_names
    ]

    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(merged, handle, ensure_ascii=False, indent=4)

    return output_file

# Example usage: merge the reconstructed missing data with the complete data into final_drugs.json.
join_processed_json_files(
    ["processed_missing_data_halfway.json", "data.json"],
    "final_drugs.json"
)


'final_drugs.json'