In [1]:
import json
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [2]:
# URL and file for saving output
url = "https://ura.go.ug/"
json_export = 'ura_faqs.json'

In [11]:
def init_webdriver(headless=True):
    """
    Initialize the Selenium WebDriver with optional headless mode.
    """
    print("Initializing WebDriver.")
    options = Options()
    if headless:
        options.add_argument('--headless') 
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')  
        options.add_argument('--remote-debugging-port=9222')  
        options.add_argument('--disable-extensions')
        options.add_argument('--disable-infobars')
        options.add_argument('--disable-browser-side-navigation')
        options.add_argument('--disable-features=VizDisplayCompositor')
        options.add_argument('--window-size=1920,1080')  
        options.add_argument('--disable-setuid-sandbox')
    try:
        service = Service(verbose=True)
        driver = webdriver.Chrome(service=service, options=options)
        print("WebDriver initialized successfully.")
        return driver
    except WebDriverException as e:
        print(f"Failed to initialize WebDriver: {e}")
        return None

In [4]:
def clean_section_name(section_name):
    """
    Remove unwanted characters and whitespaces
    """
    cleaned_name = re.sub(r"[«»]", "", section_name).strip()
    cleaned_name = re.sub(r"\s+", " ", cleaned_name)
    return cleaned_name
    

In [5]:
# FAQ dropdown button manipulator
def click_faq_dropdown(driver):
    try:
        print(f"Opening URL: {url}")
        driver.get(url)

        wait = WebDriverWait(driver, 10)

        # Wait for FAQ button to be clickable
        faq_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'dropdown-toggle') and contains(., 'FAQs')]"))
        )
        print("Clicking FAQ dropdown button.")
        faq_button.click()

        # dropdown menu is visible
        wait.until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, ".faqs-global-sec-menu-items"))
        )
        
        print("FAQ dropdown displayed successfully.")

    except Exception as e:
        print(f"Failed to click FAQ dropdown: {e}")
        raise


In [6]:
# Extract FAQ links
def extract_faq_links(driver):
    try:
        print("Extracting FAQ section links from the dropdown.")
        # Trace top-level drop down items
        dropdown_menu = driver.find_element(By.CSS_SELECTOR, ".faqs-global-sec-menu-items")
        faq_items = dropdown_menu.find_elements(By.XPATH, "./li")

        # Extract links and section names
        faq_urls = []
        for item in faq_items:
            try:
                link = item.find_element(By.TAG_NAME, 'a')
                raw_section_name = link.text.strip()
                section_url = link.get_attribute('href')

                if link.get_attribute('data-bs-toggle') == 'dropdown':
                    parent_section_name = clean_section_name(raw_section_name)
                    print(f"Found parent section: '{parent_section_name}' with URL: {section_url}")

                    # Find the nested submenu
                    try:
                        submenu = item.find_element(By.CSS_SELECTOR, "ul.submenu")
                        submenu_links = submenu.find_elements(By.TAG_NAME, 'a')
                        for submenu_link in submenu_links:
                            submenu_section_name = parent_section_name
                            submenu_url = submenu_link.get_attribute('href')
                            faq_urls.append({"url": submenu_url, "section": submenu_section_name})
                            print(f"Found subsection: '{submenu_section_name}' with URL: {submenu_url}")
                    except Exception as e:
                        print(f"No submenu found for parent section: '{parent_section_name}' - {e}")
                else:
                    # It's an actual FAQ link
                    cleaned_section_name = clean_section_name(raw_section_name)
                    if cleaned_section_name:  
                        faq_urls.append({"url": section_url, "section": cleaned_section_name})
                        print(f"Found section: '{cleaned_section_name}' with URL: {section_url}")
                    else:
                        print(f"Found section link with empty name - URL: {section_url}")
            except Exception as e:
                print(f"Failed to process a dropdown item - {e}")

        print(f"Total FAQ sections found: {len(faq_urls)}")
        return faq_urls

    except Exception as e:
        print(f"Failed to extract FAQ links: {e}")
        raise

In [7]:
# Extract FAQs from a single page
def extract_faqs_from_page(driver, page_url, section_name):
    try:
        print(f"Extracting FAQs from section: {section_name} - URL: {page_url}")
        driver.get(page_url)

        wait = WebDriverWait(driver, 10)
        wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'accordion-header'))
        )
        
        time.sleep(3)

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        crude_faqs = []
        headings = soup.find_all('h2', {'class': 'accordion-header'})

        for heading in headings:
            if 'flush' in heading.get('id',''):
                question = heading.find('button', class_='accordion-button').get_text(strip=True)

                question_cleaned = re.sub(r"^\d+\.?\s*", "", question)
                
                collapse_id = heading.find('button')['data-bs-target'].replace('#', '')
                answer_div = soup.find('div', {'id': collapse_id})

                if answer_div:
                    answer = answer_div.find('div', class_='accordion-body').get_text(strip=True)
                    faqs.append({"Question": question_cleaned, "Answer": answer, "Section": section_name})
                    print(f"Extracted FAQ - Question: {question_cleaned}, Section: {section_name}")
        
        faqs = []

        for faq in crude_faqs:
            if faq['question'] == '' and faq['answer'] == '':
                continue
            faqs.append(faq)

        print(f"Extracted {len(faqs)} FAQs from section: {section_name}")
        return faqs

    except Exception as e:
        print(f"Failed to extract FAQs from {page_url}: {e}")
        raise


In [8]:
# Save the extracted FAQs to a JSON file
def save_faqs_to_json(faqs, filename=json_export):
    try:
        print(f"Saving {len(faqs)} FAQs to JSON file: {filename}")
        with open(filename, 'w') as file:
            json.dump(faqs, file, indent=4)
        print(f"FAQs successfully saved to {filename}")
    except Exception as e:
        print(f"Failed to save FAQs to JSON: {e}")
        raise

In [12]:
# Main function to scrape all FAQs
def scrape_all_faqs(headless=True):
    print("Starting FAQ scraping process.")
    driver=None
    try:
        driver = init_webdriver(headless=headless)
        click_faq_dropdown(driver)
        faq_urls = extract_faq_links(driver)
        print(f"Collected {len(faq_urls)} FAQ URLs.")

        all_faqs = []
        for page in faq_urls:
            faqs = extract_faqs_from_page(driver, page["url"], page["section"])
            all_faqs.extend(faqs)

        save_faqs_to_json(all_faqs)

    except Exception as e:
        print(f"Error during FAQ scraping: {e}")
    finally:
        if driver:
            driver.quit()
            print("WebDriver closed.")
        else:
            print("WebDriver was not initialized; skipping quit.")

In [13]:
scrape_all_faqs()

Starting FAQ scraping process.
Initializing WebDriver.
Failed to initialize WebDriver: Message: session not created: Chrome failed to start: exited normally.
  (chrome not reachable)
  (The process started from chrome location /home/daniel/.cache/selenium/chrome/linux64/129.0.6668.58/chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x60186067713a <unknown>
#1 0x60186035d5e0 <unknown>
#2 0x601860395921 <unknown>
#3 0x6018603912c5 <unknown>
#4 0x6018603dddf6 <unknown>
#5 0x6018603dd446 <unknown>
#6 0x6018603d18c3 <unknown>
#7 0x60186039f6b3 <unknown>
#8 0x6018603a068e <unknown>
#9 0x601860641b3b <unknown>
#10 0x601860645ac1 <unknown>
#11 0x60186062e335 <unknown>
#12 0x601860646642 <unknown>
#13 0x60186061349f <unknown>
#14 0x601860666038 <unknown>
#15 0x601860666203 <unknown>
#16 0x601860675f8c <unknown>
#17 0x78f541e9ca94 <unknown>
#18 0x78f541f29c3c <unknown>

Opening URL: https://ura.go.ug/
Failed to click FAQ dropdown: 'NoneType' objec